Changeset: 6f8d0855d30c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6f8d0855d30c
Added Files:
monetdb5/modules/mal/Tests/qgram.mal
Modified Files:
monetdb5/modules/mal/Tests/All
monetdb5/modules/mal/txtsim.c
monetdb5/modules/mal/txtsim.h
monetdb5/modules/mal/txtsim.mal
Branch: default
Log Message:
Turn MAL function into C variant
diffs (148 lines):
diff --git a/monetdb5/modules/mal/Tests/All b/monetdb5/modules/mal/Tests/All
--- a/monetdb5/modules/mal/Tests/All
+++ b/monetdb5/modules/mal/Tests/All
@@ -16,6 +16,8 @@ mserver00
modulechk.mal
+qgram
+
# statistics00
#statistics01
diff --git a/monetdb5/modules/mal/Tests/qgram.mal
b/monetdb5/modules/mal/Tests/qgram.mal
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/Tests/qgram.mal
@@ -0,0 +1,2 @@
+b := txtsim.str2qgrams("hello world");
+io.print(b);
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -924,3 +924,26 @@ CMDqgramselfjoin(BAT **res, BAT *qgram,
throw(MAL, "txtsim.qgramselfjoin", MAL_MALLOC_FAIL);
}
+/*
+ * Break a string into overlapping 4-byte q-grams.
+ *
+ * The input *val is padded to "##" + *val + "$$" and a 4-byte window is
+ * slid over the padded string one byte at a time.  Each window is appended
+ * as a NUL-terminated string to a new BAT with a void head (seqbase 0) and
+ * a str tail.
+ *
+ * ret: receives the cache id of the resulting BAT.
+ * val: pointer to the NUL-terminated input string.
+ *
+ * Returns MAL_SUCCEED, or throws MAL_MALLOC_FAIL when the scratch buffer
+ * or the result BAT cannot be allocated.
+ */
+str
+CMDstr2qgrams(int *ret, str *val)
+{
+	BAT *bn;
+	size_t i, n = strlen(*val);
+	size_t len = n + 4;	/* padded length, not counting the NUL */
+	char qgram[5];		/* 4 payload bytes + terminating NUL */
+	str s;
+
+	/* one extra byte for the terminator: "##" + val + "$$" + NUL */
+	s = GDKmalloc(len + 1);
+	if (s == NULL)
+		throw(MAL, "txtsim.str2qgrams", MAL_MALLOC_FAIL);
+	memcpy(s, "##", 2);
+	memcpy(s + 2, *val, n);
+	memcpy(s + 2 + n, "$$", 3);	/* copies the NUL as well */
+
+	bn = BATnew(TYPE_void, TYPE_str, (BUN) n);
+	if (bn == NULL) {
+		GDKfree(s);
+		throw(MAL, "txtsim.str2qgrams", MAL_MALLOC_FAIL);
+	}
+	BATseqbase(bn,0);
+
+	/* the window copy never touches qgram[4], so it stays terminated */
+	qgram[4] = 0;
+	/*
+	 * Emits n q-grams, matching the capacity requested above.
+	 * NOTE(review): the last window of the padded string (the one ending
+	 * in "$$") is not emitted; this bound is kept as it was -- confirm
+	 * whether len - 3 windows were intended.
+	 */
+	for (i = 0; i < len - 4; i++) {
+		memcpy(qgram, s + i, 4);
+		BUNappend(bn,qgram,FALSE);
+	}
+	GDKfree(s);	/* scratch buffer is no longer needed */
+	BBPkeepref(*ret = bn->batCacheid);
+	return MAL_SUCCEED;
+}
diff --git a/monetdb5/modules/mal/txtsim.h b/monetdb5/modules/mal/txtsim.h
--- a/monetdb5/modules/mal/txtsim.h
+++ b/monetdb5/modules/mal/txtsim.h
@@ -52,6 +52,7 @@ txtsim_export str soundex_impl(str *res,
txtsim_export str stringdiff_impl(int *res, str *s1, str*s2);
txtsim_export str CMDqgramnormalize(str *res, str *input);
txtsim_export str CMDqgramselfjoin(BAT **res, BAT *qgram, BAT *id, BAT *pos,
BAT *len, flt *c, int *k);
+txtsim_export str CMDstr2qgrams(int *ret, str *val);
#endif /*_TXTSIM_H*/
diff --git a/monetdb5/modules/mal/txtsim.mal b/monetdb5/modules/mal/txtsim.mal
--- a/monetdb5/modules/mal/txtsim.mal
+++ b/monetdb5/modules/mal/txtsim.mal
@@ -57,81 +57,6 @@ command qgramselfjoin(qgram:bat[:oid, :o
address CMDqgramselfjoin
comment "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram
positions";
-# @-
-# #mil implementation
-#
-# #proc str2qgrams(str s) : bat[oid,str]
-# #{
-# # s := qgramnormalize(s);
-# #
-# # var len := s.length();
-# # var b := bat(str,void,len+4);
-# # var i := 0;
-# # var last := " ";
-# #
-# # b.insert("#",nil);
-# # b.insert("#",nil);
-# # while(i < len) {
-# # b.insert(s.string(i,1), nil);
-# # if (b.count() > 32764) break; # ignore suffix of >32KB strings
-# # i :+= 1;
-# # }
-# # b.insert("$",nil);
-# # b.insert("$",nil);
-# # b.reverse().seqbase(0@0);
-# #
-# # var c0 := b.select(0@0, oid(b.count() - 3)).mark(0@0).reverse();
-# # var c1 := b.select(1@0, oid(b.count() - 2)).mark(0@0).reverse();
-# # var c2 := b.select(2@0, oid(b.count() - 1)).mark(0@0).reverse();
-# # return [+](c0, [+](c1, c2));
-# #}
-# #mal implementation
-function txtsim.str2qgrams{inline}(s:str):bat[:oid,:str];
- s := txtsim.qgramnormalize(s);
-
- l := calc.length(s);
- len := calc.lng(l);
- len := calc.+(len,4);
- b := bat.new(:str,:oid,len);
- i := 0;
- last := " ";
- bat.insert(b,"#",nil:oid);
- bat.insert(b,"#",nil:oid);
- cnt:= 0:wrd;
- id := calc.oid(cnt);
- barrier B:= calc.<(i,len);
- sstr := str.string(s,i,1);
- bat.insert(b,sstr, nil:oid);
- cnt := aggr.count(b);
- leave B:= calc.>(cnt,32764); # ignore suffix of >32KB strings
- i := calc.+(i,1);
- redo B:= calc.<(i,len);
- exit B;
- bat.insert(b,"$",nil:oid);
- bat.insert(b,"$",nil:oid);
- rev := bat.reverse (b);
- alg := algebra.markH(rev, 0@0);
- cnt := aggr.count(alg);
- cnt := calc.-(cnt,3);
- sel := algebra.select(b, 0@0, id);
- mrk := algebra.markT(sel, 0@0);
- c0 := bat.reverse(mrk);
-
- cnt := aggr.count(alg);
- cnt := calc.-(cnt,2);
- id := calc.oid(cnt);
- sel := algebra.select(b, 1@0, id);
- mrk := algebra.markT(sel, 0@0);
- c1 := bat.reverse(mrk);
- cnt := aggr.count(alg);
- cnt := calc.-(cnt,1);
- id := calc.oid(cnt);
- sel := algebra.select(b, 2@0, id);
- mrk := algebra.markT(sel, 0@0);
- c2 := bat.reverse(mrk);
-
- res := batcalc.+(c1, c2);
- res := batcalc.+(c0, res);
-
- return str2qgrams := res;
-end str2qgrams;
+command txtsim.str2qgrams(s:str):bat[:oid,:str]
+address CMDstr2qgrams
+comment "Break the string into 4-grams";
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list