Changeset: 037d1a2fc232 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=037d1a2fc232
Modified Files:
monetdb5/modules/mal/Tests/qgram.stable.out
monetdb5/modules/mal/txtsim.c
Branch: default
Log Message:
Fix txtsim.str2qgrams implementation and approve fixed output.
- allocate enough memory so that the string plus its terminating NUL
byte actually fit;
- free memory afterwards;
- allocate enough memory to hold a 4-gram (i.e. 5 bytes);
- properly NUL-terminate the qgram string;
- if we start with the first 2 characters of the string, we should end
with the last 2;
- check return values.
Note that the code is still incorrect: it should deal properly with
UTF-8 encoded characters.
diffs (81 lines):
diff --git a/monetdb5/modules/mal/Tests/qgram.stable.out b/monetdb5/modules/mal/Tests/qgram.stable.out
--- a/monetdb5/modules/mal/Tests/qgram.stable.out
+++ b/monetdb5/modules/mal/Tests/qgram.stable.out
@@ -25,18 +25,19 @@ end main;
#-------------------------#
# h t # name
# void str # type
-#-------------------------#
-[ 0@0, "##heÿ\177" ]
-[ 1@0, "#helÿ\177" ]
-[ 2@0, "hellÿ\177" ]
-[ 3@0, "elloÿ\177" ]
-[ 4@0, "llo ÿ\177" ]
-[ 5@0, "lo wÿ\177" ]
-[ 6@0, "o woÿ\177" ]
-[ 7@0, " worÿ\177" ]
-[ 8@0, "worlÿ\177" ]
-[ 9@0, "orldÿ\177" ]
-[ 10@0, "rld$ÿ\177" ]
+#-----------------#
+[ 0@0, "##he" ]
+[ 1@0, "#hel" ]
+[ 2@0, "hell" ]
+[ 3@0, "ello" ]
+[ 4@0, "llo " ]
+[ 5@0, "lo w" ]
+[ 6@0, "o wo" ]
+[ 7@0, " wor" ]
+[ 8@0, "worl" ]
+[ 9@0, "orld" ]
+[ 10@0, "rld$" ]
+[ 11@0, "ld$$" ]
# 23:21:31 >
# 23:21:31 > "Done."
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -928,22 +928,28 @@ str
CMDstr2qgrams(int *ret, str *val)
{
BAT *bn;
- int i, len = (int)strlen(*val) +4;
- str s = GDKzalloc( len);
- char qgram[4];
+ size_t i, len = strlen(*val) + 5;
+ str s = GDKmalloc(len);
+ char qgram[5];
- s[0]=0;
- strcat(s,"##");
- strcat(s,*val);
- strcat(s,"$$");
- qgram[3]=0;
- bn = BATnew(TYPE_void, TYPE_str, (int) strlen(*val));
- BATseqbase(bn,0);
-
- for ( i= 0; i< len -4; i++){
- strncpy(qgram,s+i,4);
- BUNappend(bn,qgram,FALSE);
+ if (s == NULL)
+ throw(MAL, "txtsim.str2qgram", MAL_MALLOC_FAIL);
+ strcpy(s, "##");
+ strcpy(s + 2, *val);
+ strcpy(s + len - 3, "$$");
+ qgram[4] = 0; /* we're going to deal with 4 char strings */
+ bn = BATnew(TYPE_void, TYPE_str, (BUN) strlen(*val));
+ if (bn == NULL) {
+ GDKfree(s);
+ throw(MAL, "txtsim.str2qgram", MAL_MALLOC_FAIL);
+ }
+ BATseqbase(bn, 0);
+
+ for (i = 0; i < len - 4; i++){
+ strncpy(qgram, s + i, 4);
+ BUNappend(bn, qgram, FALSE);
}
BBPkeepref(*ret = bn->batCacheid);
+ GDKfree(s);
return MAL_SUCCEED;
}
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list