Changeset: 037d1a2fc232 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=037d1a2fc232
Modified Files:
        monetdb5/modules/mal/Tests/qgram.stable.out
        monetdb5/modules/mal/txtsim.c
Branch: default
Log Message:

Fix txtsim.str2qgrams implementation and approve fixed output.

- allocate enough memory so that the string plus the terminating NUL
  byte actually fit;
- free memory afterwards;
- allocate enough memory to hold a 4-gram (i.e. 5 bytes);
- properly close qgram string;
- if we start with the first 2 characters of the string, we should end
  with the last 2;
- check return values.

Note that the code is still incorrect: it should deal properly with
UTF-8 encoded characters.


diffs (81 lines):

diff --git a/monetdb5/modules/mal/Tests/qgram.stable.out b/monetdb5/modules/mal/Tests/qgram.stable.out
--- a/monetdb5/modules/mal/Tests/qgram.stable.out
+++ b/monetdb5/modules/mal/Tests/qgram.stable.out
@@ -25,18 +25,19 @@ end main;
 #-------------------------#
 # h    t                 # name
 # void str               # type
-#-------------------------#
-[ 0@0,   "##heÿ\177"     ]
-[ 1@0,   "#helÿ\177"     ]
-[ 2@0,   "hellÿ\177"     ]
-[ 3@0,   "elloÿ\177"     ]
-[ 4@0,   "llo ÿ\177"     ]
-[ 5@0,   "lo wÿ\177"     ]
-[ 6@0,   "o woÿ\177"     ]
-[ 7@0,   " worÿ\177"     ]
-[ 8@0,   "worlÿ\177"     ]
-[ 9@0,   "orldÿ\177"     ]
-[ 10@0,          "rld$ÿ\177"     ]
+#-----------------#
+[ 0@0,   "##he"  ]
+[ 1@0,   "#hel"  ]
+[ 2@0,   "hell"  ]
+[ 3@0,   "ello"  ]
+[ 4@0,   "llo "  ]
+[ 5@0,   "lo w"  ]
+[ 6@0,   "o wo"  ]
+[ 7@0,   " wor"  ]
+[ 8@0,   "worl"  ]
+[ 9@0,   "orld"  ]
+[ 10@0,          "rld$"  ]
+[ 11@0,          "ld$$"  ]
 
 # 23:21:31 >  
 # 23:21:31 >  "Done."
diff --git a/monetdb5/modules/mal/txtsim.c b/monetdb5/modules/mal/txtsim.c
--- a/monetdb5/modules/mal/txtsim.c
+++ b/monetdb5/modules/mal/txtsim.c
@@ -928,22 +928,28 @@ str
 CMDstr2qgrams(int *ret, str *val)
 {
        BAT *bn;
-       int i, len = (int)strlen(*val) +4;
-       str s = GDKzalloc( len);
-       char qgram[4];
+       size_t i, len = strlen(*val) + 5;
+       str s = GDKmalloc(len);
+       char qgram[5];
 
-       s[0]=0;
-       strcat(s,"##");
-       strcat(s,*val);
-       strcat(s,"$$");
-       qgram[3]=0;
-       bn = BATnew(TYPE_void, TYPE_str, (int) strlen(*val));
-       BATseqbase(bn,0);
-       
-       for ( i= 0; i< len -4; i++){
-               strncpy(qgram,s+i,4);
-               BUNappend(bn,qgram,FALSE);
+       if (s == NULL)
+               throw(MAL, "txtsim.str2qgram", MAL_MALLOC_FAIL);
+       strcpy(s, "##");
+       strcpy(s + 2, *val);
+       strcpy(s + len - 3, "$$");
+       qgram[4] = 0;                           /* we're going to deal with 4 char strings */
+       bn = BATnew(TYPE_void, TYPE_str, (BUN) strlen(*val));
+       if (bn == NULL) {
+               GDKfree(s);
+               throw(MAL, "txtsim.str2qgram", MAL_MALLOC_FAIL);
+       }
+       BATseqbase(bn, 0);
+
+       for (i = 0; i < len - 4; i++){
+               strncpy(qgram, s + i, 4);
+               BUNappend(bn, qgram, FALSE);
        }
        BBPkeepref(*ret = bn->batCacheid);
+       GDKfree(s);
        return MAL_SUCCEED;
 }
_______________________________________________
Checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to