Changeset: 6f8d0855d30c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6f8d0855d30c
Added Files:
        monetdb5/modules/mal/Tests/qgram.mal
Modified Files:
        monetdb5/modules/mal/Tests/All
        monetdb5/modules/mal/txtsim.c
        monetdb5/modules/mal/txtsim.h
        monetdb5/modules/mal/txtsim.mal
Branch: default
Log Message:

Turn MAL function into C variant


diffs (148 lines):

diff --git a/monetdb5/modules/mal/Tests/All b/monetdb5/modules/mal/Tests/All
--- a/monetdb5/modules/mal/Tests/All
+++ b/monetdb5/modules/mal/Tests/All
@@ -16,6 +16,8 @@ mserver00
 
 modulechk.mal
 
+qgram
+
 # statistics00
 #statistics01
 
diff --git a/monetdb5/modules/mal/Tests/qgram.mal b/monetdb5/modules/mal/Tests/qgram.mal
new file mode 100644
--- /dev/null
+++ b/monetdb5/modules/mal/Tests/qgram.mal
@@ -0,0 +1,2 @@
+b := txtsim.str2qgrams("hello world"); # break a sample string into q-grams
+io.print(b); # print the BAT returned by str2qgrams
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -924,3 +924,45 @@ CMDqgramselfjoin(BAT **res, BAT *qgram, 
        throw(MAL, "txtsim.qgramselfjoin", MAL_MALLOC_FAIL);
 }
 
+/*
+ * Split *val into overlapping 4-character q-grams.
+ *
+ * The input is framed as "##" + *val + "$$"; one gram is emitted for each of
+ * the first strlen(*val) start offsets of the framed string.  The grams are
+ * appended to a new BAT (void head with seqbase 0, str tail) whose cache id
+ * is stored in *ret.
+ *
+ * Returns MAL_SUCCEED, or throws MAL_MALLOC_FAIL when an allocation fails.
+ */
+str
+CMDstr2qgrams(int *ret, str *val)
+{
+       BAT *bn;
+       size_t i, vlen = strlen(*val);
+       char qgram[5];  /* 4 characters plus the terminating NUL */
+       /* "##" + value + "$$" plus one byte for the terminating NUL */
+       str s = GDKzalloc(vlen + 5);
+
+       if (s == NULL)
+               throw(MAL, "txtsim.str2qgrams", MAL_MALLOC_FAIL);
+       memcpy(s, "##", 2);
+       memcpy(s + 2, *val, vlen);
+       memcpy(s + 2 + vlen, "$$", 3);  /* 3: also copies the NUL */
+
+       bn = BATnew(TYPE_void, TYPE_str, (BUN) vlen);
+       if (bn == NULL) {
+               GDKfree(s);
+               throw(MAL, "txtsim.str2qgrams", MAL_MALLOC_FAIL);
+       }
+       BATseqbase(bn, 0);
+
+       qgram[4] = 0;
+       for (i = 0; i < vlen; i++) {
+               /* fixed-width copy; qgram[4] keeps the gram terminated */
+               memcpy(qgram, s + i, 4);
+               BUNappend(bn, qgram, FALSE);
+       }
+       GDKfree(s);
+       BBPkeepref(*ret = bn->batCacheid);
+       return MAL_SUCCEED;
+}
diff --git a/monetdb5/modules/mal/txtsim.h b/monetdb5/modules/mal/txtsim.h
--- a/monetdb5/modules/mal/txtsim.h
+++ b/monetdb5/modules/mal/txtsim.h
@@ -52,6 +52,7 @@ txtsim_export str soundex_impl(str *res,
 txtsim_export str stringdiff_impl(int *res, str *s1, str*s2);
 txtsim_export str CMDqgramnormalize(str *res, str *input);
 txtsim_export str CMDqgramselfjoin(BAT **res, BAT *qgram, BAT *id, BAT *pos, BAT *len, flt *c, int *k);
+txtsim_export str CMDstr2qgrams(int *ret, str *val);
 
 #endif /*_TXTSIM_H*/
 
diff --git a/monetdb5/modules/mal/txtsim.mal b/monetdb5/modules/mal/txtsim.mal
--- a/monetdb5/modules/mal/txtsim.mal
+++ b/monetdb5/modules/mal/txtsim.mal
@@ -57,81 +57,6 @@ command qgramselfjoin(qgram:bat[:oid, :o
 address CMDqgramselfjoin
 comment "QGram self-join on ordered(!) qgram tables and sub-ordered q-gram positions";
 
-# @-
-# #mil implementation
-#
-# #proc str2qgrams(str s) : bat[oid,str]
-# #{
-# #    s := qgramnormalize(s);
-# #
-# #    var len := s.length();
-# #    var b := bat(str,void,len+4);
-# #    var i := 0;
-# #    var last := " ";
-# #
-# #    b.insert("#",nil);
-# #    b.insert("#",nil);
-# #    while(i < len) {
-# #        b.insert(s.string(i,1), nil);
-# #        if (b.count() > 32764) break; # ignore suffix of >32KB strings
-# #        i :+= 1;
-# #    }
-# #    b.insert("$",nil);
-# #    b.insert("$",nil);
-# #    b.reverse().seqbase(0@0);
-# #
-# #    var c0 := b.select(0@0, oid(b.count() - 3)).mark(0@0).reverse();
-# #    var c1 := b.select(1@0, oid(b.count() - 2)).mark(0@0).reverse();
-# #    var c2 := b.select(2@0, oid(b.count() - 1)).mark(0@0).reverse();
-# #    return [+](c0, [+](c1, c2));
-# #}
-# #mal implementation
-function txtsim.str2qgrams{inline}(s:str):bat[:oid,:str];
-    s := txtsim.qgramnormalize(s);
-
-    l := calc.length(s);
-       len := calc.lng(l);
-    len := calc.+(len,4);
-    b := bat.new(:str,:oid,len);
-    i := 0;
-    last := " ";
-    bat.insert(b,"#",nil:oid);
-    bat.insert(b,"#",nil:oid);
-       cnt:= 0:wrd;
-    id := calc.oid(cnt);
-    barrier B:= calc.<(i,len);
-       sstr := str.string(s,i,1);
-        bat.insert(b,sstr, nil:oid);
-       cnt := aggr.count(b);
-        leave B:= calc.>(cnt,32764); # ignore suffix of >32KB strings
-        i := calc.+(i,1);
-    redo B:= calc.<(i,len);
-    exit B;
-    bat.insert(b,"$",nil:oid);
-    bat.insert(b,"$",nil:oid);
-    rev := bat.reverse (b);
-    alg := algebra.markH(rev, 0@0);
-    cnt := aggr.count(alg);
-    cnt := calc.-(cnt,3);
-    sel := algebra.select(b, 0@0, id);
-    mrk := algebra.markT(sel, 0@0);
-    c0  := bat.reverse(mrk);
-
-    cnt := aggr.count(alg);
-    cnt := calc.-(cnt,2);
-    id := calc.oid(cnt);
-    sel := algebra.select(b, 1@0, id);
-    mrk := algebra.markT(sel, 0@0);
-    c1  := bat.reverse(mrk);
-    cnt := aggr.count(alg);
-    cnt := calc.-(cnt,1);
-    id := calc.oid(cnt);
-    sel := algebra.select(b, 2@0, id);
-    mrk := algebra.markT(sel, 0@0);
-    c2  := bat.reverse(mrk);
-
-    res := batcalc.+(c1, c2);
-    res := batcalc.+(c0, res);
-
-    return str2qgrams := res;
-end str2qgrams;
+command txtsim.str2qgrams(s:str):bat[:oid,:str]
+address CMDstr2qgrams
+comment "Break the string into 4-grams"; # implemented by the C function CMDstr2qgrams
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to