Changeset: ca1500dcb7c4 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/ca1500dcb7c4
Modified Files:
clients/Tests/MAL-signatures-hge.test
clients/Tests/MAL-signatures.test
monetdb5/modules/mal/txtsim.c
Branch: txtsim
Log Message:
Add comments back and approve the signatures
diffs (truncated from 310 to 300 lines):
diff --git a/clients/Tests/MAL-signatures-hge.test
b/clients/Tests/MAL-signatures-hge.test
--- a/clients/Tests/MAL-signatures-hge.test
+++ b/clients/Tests/MAL-signatures-hge.test
@@ -35507,17 +35507,17 @@ battxtsim
maxlevenshtein
pattern battxtsim.maxlevenshtein(X_0:bat[:str], X_1:bat[:str],
X_2:int):bat[:bit]
BATTXTSIMmaxlevenshtein;
-(empty)
+Same as maxlevenshtein but for BATS
battxtsim
maxlevenshtein
pattern battxtsim.maxlevenshtein(X_0:bat[:str], X_1:bat[:str], X_2:int,
X_3:int, X_4:int):bat[:bit]
BATTXTSIMmaxlevenshtein;
-(empty)
+Same as maxlevenshtein but for BATS
battxtsim
similarity
command battxtsim.similarity(X_0:bat[:str], X_1:bat[:str]):bat[:dbl]
fstrcmp0_impl_bulk;
-(empty)
+(Deprecated) Normalized edit distance between two strings
baturl
extractURLHost
command baturl.extractURLHost(X_0:bat[:str], X_1:bit):bat[:str]
@@ -51007,87 +51007,87 @@ txtsim
dameraulevenshtein
pattern txtsim.dameraulevenshtein(X_0:str, X_1:str):int
TXTSIMdameraulevenshtein;
-(empty)
+Calculates Damerau-Levenshtein distance between two strings, operation costs
(ins/del = 1, replacement = 1, transposition = 2)
txtsim
dameraulevenshtein
pattern txtsim.dameraulevenshtein(X_0:str, X_1:str, X_2:int, X_3:int,
X_4:int):int
TXTSIMdameraulevenshtein;
-(empty)
+Calculates Damerau-Levenshtein distance between two strings, variable
operation costs (ins/del, replacement, transposition)
txtsim
editdistance
command txtsim.editdistance(X_0:str, X_1:str):int
TXTSIMdameraulevenshtein1;
-(empty)
+Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and
transpose = 2
txtsim
editdistance2
command txtsim.editdistance2(X_0:str, X_1:str):int
TXTSIMdameraulevenshtein2;
-(empty)
+Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and
transpose = 1
txtsim
jaro_winkler_similarity
command txtsim.jaro_winkler_similarity(X_0:str, X_1:str):dbl
jaro_winkler_similarity;
-(empty)
+Calculate Jaro Winkler similarity
txtsim
levenshtein
pattern txtsim.levenshtein(X_0:str, X_1:str):int
TXTSIMlevenshtein;
-(empty)
+Calculates Levenshtein distance between two strings, operation costs (ins/del
= 1, replacement = 1)
txtsim
levenshtein
pattern txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int):int
TXTSIMlevenshtein;
-(empty)
+Calculates Levenshtein distance between two strings, variable operation costs
(ins/del, replacement)
txtsim
levenshtein
pattern txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int
TXTSIMlevenshtein;
-(empty)
+(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance
between two strings, variable operation costs (ins/del, replacement,
transposition)
txtsim
maxlevenshtein
pattern txtsim.maxlevenshtein(X_0:str, X_1:str, X_2:int):int
TXTSIMmaxlevenshtein;
-(empty)
+Levenshtein distance with basic costs but up to a MAX
txtsim
maxlevenshtein
pattern txtsim.maxlevenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int
TXTSIMmaxlevenshtein;
-(empty)
+Levenshtein distance with variable costs but up to a MAX
txtsim
qgramnormalize
command txtsim.qgramnormalize(X_0:str):str
qgram_normalize;
-(empty)
+'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space
txtsim
qgramselfjoin
command txtsim.qgramselfjoin(X_0:bat[:oid], X_1:bat[:oid], X_2:bat[:int],
X_3:bat[:int], X_4:flt, X_5:int) (X_6:bat[:int], X_7:bat[:int])
qgram_selfjoin;
-(empty)
+QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions
txtsim
similarity
command txtsim.similarity(X_0:str, X_1:str):dbl
fstrcmp0_impl;
-(empty)
+(Deprecated) Normalized edit distance between two strings
txtsim
similarity
command txtsim.similarity(X_0:str, X_1:str, X_2:dbl):dbl
fstrcmp_impl;
-(empty)
+(Deprecated) Normalized edit distance between two strings
txtsim
soundex
command txtsim.soundex(X_0:str):str
soundex;
-(empty)
+Soundex function for phonetic matching
txtsim
str2qgrams
command txtsim.str2qgrams(X_0:str):bat[:str]
str_2_qgrams;
-(empty)
+Break the string into 4-grams
txtsim
stringdiff
command txtsim.stringdiff(X_0:str, X_1:str):int
stringdiff;
-(empty)
+Calculate the soundexed editdistance
url
extractURLHost
command url.extractURLHost(X_0:str, X_1:bit):str
diff --git a/clients/Tests/MAL-signatures.test
b/clients/Tests/MAL-signatures.test
--- a/clients/Tests/MAL-signatures.test
+++ b/clients/Tests/MAL-signatures.test
@@ -26557,17 +26557,17 @@ battxtsim
maxlevenshtein
pattern battxtsim.maxlevenshtein(X_0:bat[:str], X_1:bat[:str],
X_2:int):bat[:bit]
BATTXTSIMmaxlevenshtein;
-(empty)
+Same as maxlevenshtein but for BATS
battxtsim
maxlevenshtein
pattern battxtsim.maxlevenshtein(X_0:bat[:str], X_1:bat[:str], X_2:int,
X_3:int, X_4:int):bat[:bit]
BATTXTSIMmaxlevenshtein;
-(empty)
+Same as maxlevenshtein but for BATS
battxtsim
similarity
command battxtsim.similarity(X_0:bat[:str], X_1:bat[:str]):bat[:dbl]
fstrcmp0_impl_bulk;
-(empty)
+(Deprecated) Normalized edit distance between two strings
baturl
extractURLHost
command baturl.extractURLHost(X_0:bat[:str], X_1:bit):bat[:str]
@@ -39332,87 +39332,87 @@ txtsim
dameraulevenshtein
pattern txtsim.dameraulevenshtein(X_0:str, X_1:str):int
TXTSIMdameraulevenshtein;
-(empty)
+Calculates Damerau-Levenshtein distance between two strings, operation costs
(ins/del = 1, replacement = 1, transposition = 2)
txtsim
dameraulevenshtein
pattern txtsim.dameraulevenshtein(X_0:str, X_1:str, X_2:int, X_3:int,
X_4:int):int
TXTSIMdameraulevenshtein;
-(empty)
+Calculates Damerau-Levenshtein distance between two strings, variable
operation costs (ins/del, replacement, transposition)
txtsim
editdistance
command txtsim.editdistance(X_0:str, X_1:str):int
TXTSIMdameraulevenshtein1;
-(empty)
+Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and
transpose = 2
txtsim
editdistance2
command txtsim.editdistance2(X_0:str, X_1:str):int
TXTSIMdameraulevenshtein2;
-(empty)
+Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and
transpose = 1
txtsim
jaro_winkler_similarity
command txtsim.jaro_winkler_similarity(X_0:str, X_1:str):dbl
jaro_winkler_similarity;
-(empty)
+Calculate Jaro Winkler similarity
txtsim
levenshtein
pattern txtsim.levenshtein(X_0:str, X_1:str):int
TXTSIMlevenshtein;
-(empty)
+Calculates Levenshtein distance between two strings, operation costs (ins/del
= 1, replacement = 1)
txtsim
levenshtein
pattern txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int):int
TXTSIMlevenshtein;
-(empty)
+Calculates Levenshtein distance between two strings, variable operation costs
(ins/del, replacement)
txtsim
levenshtein
pattern txtsim.levenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int
TXTSIMlevenshtein;
-(empty)
+(Backwards compatibility purposes) Calculates Damerau-Levenshtein distance
between two strings, variable operation costs (ins/del, replacement,
transposition)
txtsim
maxlevenshtein
pattern txtsim.maxlevenshtein(X_0:str, X_1:str, X_2:int):int
TXTSIMmaxlevenshtein;
-(empty)
+Levenshtein distance with basic costs but up to a MAX
txtsim
maxlevenshtein
pattern txtsim.maxlevenshtein(X_0:str, X_1:str, X_2:int, X_3:int, X_4:int):int
TXTSIMmaxlevenshtein;
-(empty)
+Levenshtein distance with variable costs but up to a MAX
txtsim
qgramnormalize
command txtsim.qgramnormalize(X_0:str):str
qgram_normalize;
-(empty)
+'Normalizes' strings (eg. toUpper and replaces non-alphanumerics with one space
txtsim
qgramselfjoin
command txtsim.qgramselfjoin(X_0:bat[:oid], X_1:bat[:oid], X_2:bat[:int],
X_3:bat[:int], X_4:flt, X_5:int) (X_6:bat[:int], X_7:bat[:int])
qgram_selfjoin;
-(empty)
+QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions
txtsim
similarity
command txtsim.similarity(X_0:str, X_1:str):dbl
fstrcmp0_impl;
-(empty)
+(Deprecated) Normalized edit distance between two strings
txtsim
similarity
command txtsim.similarity(X_0:str, X_1:str, X_2:dbl):dbl
fstrcmp_impl;
-(empty)
+(Deprecated) Normalized edit distance between two strings
txtsim
soundex
command txtsim.soundex(X_0:str):str
soundex;
-(empty)
+Soundex function for phonetic matching
txtsim
str2qgrams
command txtsim.str2qgrams(X_0:str):bat[:str]
str_2_qgrams;
-(empty)
+Break the string into 4-grams
txtsim
stringdiff
command txtsim.stringdiff(X_0:str, X_1:str):int
stringdiff;
-(empty)
+Calculate the soundexed editdistance
url
extractURLHost
command url.extractURLHost(X_0:str, X_1:bit):str
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -1459,28 +1459,28 @@ fstrcmp0_impl_bulk(bat *res, bat *string
#include "mel.h"
mel_func txtsim_init_funcs[] = {
- pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein,
false, "", args(1,3,arg("",int),arg("x",str),arg("y",str))),
- pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein,
false, "",
args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
- command("txtsim", "editdistance", TXTSIMdameraulevenshtein1, false, "",
args(1,3, arg("",int),arg("s",str),arg("t",str))),
- command("txtsim", "editdistance2", TXTSIMdameraulevenshtein2, false,
"", args(1,3, arg("",int),arg("s",str),arg("t",str))),
- pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "",
args(1,3,arg("",int),arg("s",str),arg("t",str))),
- pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "",
args(1,5,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int))),
- pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "",
args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
- pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "",
args(1, 4, arg("",int), arg("l",str),arg("r",str),arg("k",int))),
- pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false, "",
args(1, 6, arg("",int),
arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
- pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false,
"", args(1, 4, batarg("",bit), batarg("l",str),batarg("r",str),arg("k",int))),
- pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false,
"", args(1, 6, batarg("",bit),
batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
+ pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein,
false, "Calculates Damerau-Levenshtein distance between two strings, operation
costs (ins/del = 1, replacement = 1, transposition = 2)",
args(1,3,arg("",int),arg("x",str),arg("y",str))),
+ pattern("txtsim", "dameraulevenshtein", TXTSIMdameraulevenshtein,
false, "Calculates Damerau-Levenshtein distance between two strings, variable
operation costs (ins/del, replacement, transposition)",
args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
+ command("txtsim", "editdistance", TXTSIMdameraulevenshtein1, false,
"Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and
transpose = 2", args(1,3, arg("",int),arg("s",str),arg("t",str))),
+ command("txtsim", "editdistance2", TXTSIMdameraulevenshtein2, false,
"Alias for Damerau-Levenshtein(str,str), insdel cost = 1, replace cost = 1 and
transpose = 1", args(1,3, arg("",int),arg("s",str),arg("t",str))),
+ pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates
Levenshtein distance between two strings, operation costs (ins/del = 1,
replacement = 1)", args(1,3,arg("",int),arg("s",str),arg("t",str))),
+ pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "Calculates
Levenshtein distance between two strings, variable operation costs (ins/del,
replacement)",
args(1,5,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int))),
+ pattern("txtsim", "levenshtein", TXTSIMlevenshtein, false, "(Backwards
compatibility purposes) Calculates Damerau-Levenshtein distance between two
strings, variable operation costs (ins/del, replacement, transposition)",
args(1,6,arg("",int),arg("x",str),arg("y",str),arg("insdel_cost",int),arg("replace_cost",int),arg("transpose_cost",int))),
+ pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false,
"Levenshtein distance with basic costs but up to a MAX", args(1, 4,
arg("",int), arg("l",str),arg("r",str),arg("k",int))),
+ pattern("txtsim", "maxlevenshtein", TXTSIMmaxlevenshtein, false,
"Levenshtein distance with variable costs but up to a MAX", args(1, 6,
arg("",int),
arg("l",str),arg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
+ pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false,
"Same as maxlevenshtein but for BATS", args(1, 4, batarg("",bit),
batarg("l",str),batarg("r",str),arg("k",int))),
+ pattern("battxtsim", "maxlevenshtein", BATTXTSIMmaxlevenshtein, false,
"Same as maxlevenshtein but for BATS", args(1, 6, batarg("",bit),
batarg("l",str),batarg("r",str),arg("k",int),arg("insdel_cost",int),arg("replace_cost",int))),
/* command("battxtsim", "maxlevenshteinselect",
TXTSIMmaxlevenshteinselect, false, "", args(1,6,
batarg("",oid),batarg("b",str),batarg("s",oid),arg("anti",bit))), */
/* command("battxtsim", "maxlevenshteinjoin", TXTSIMmaxlevenshteinjoin,
false, "", args(2,10,
batarg("",oid),batarg("",oid),batarg("l",str),batarg("r",str),batarg("sl",oid),batarg("sr",oid),arg("nil_matches",bit),arg("estimate",lng),arg("anti",bit))),
*/
- command("txtsim", "soundex", soundex, false, "", args(1,2,
arg("",str),arg("name",str))),
- command("txtsim", "stringdiff", stringdiff, false, "", args(1,3,
arg("",int),arg("s1",str),arg("s2",str))),
- command("txtsim", "qgramnormalize", qgram_normalize, false, "",
args(1,2, arg("",str),arg("input",str))),
- command("txtsim", "qgramselfjoin", qgram_selfjoin, false, "", args(2,8,
batarg("",int),batarg("",int),batarg("qgram",oid),batarg("id",oid),batarg("pos",int),batarg("len",int),arg("c",flt),arg("k",int))),
- command("txtsim", "str2qgrams", str_2_qgrams, false, "", args(1,2,
batarg("",str),arg("s",str))),
- command("txtsim", "jaro_winkler_similarity", jaro_winkler_similarity,
false, "", args(1,3, arg("",dbl),arg("x",str),arg("y",str))),
- command("txtsim", "similarity", fstrcmp_impl, false, "", args(1,4,
arg("",dbl),arg("string1",str),arg("string2",str),arg("minimum",dbl))),
- command("txtsim", "similarity", fstrcmp0_impl, false, "", args(1,3,
arg("",dbl),arg("string1",str),arg("string2",str))),
- command("battxtsim", "similarity", fstrcmp0_impl_bulk, false, "",
args(1,3, batarg("",dbl),batarg("string1",str),batarg("string2",str))),
+ command("txtsim", "soundex", soundex, false, "Soundex function for
phonetic matching", args(1,2, arg("",str),arg("name",str))),
+ command("txtsim", "stringdiff", stringdiff, false, "Calculate the
soundexed editdistance", args(1,3, arg("",int),arg("s1",str),arg("s2",str))),
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]