Changeset: fbcd6ce89476 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fbcd6ce89476
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:
Count byte pairs instead of Unicode character pairs
diffs (83 lines):
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -13,33 +13,33 @@
/* This counts how many Unicode code points the given string
* contains.
*/
-static size_t
-GDKstrimp_strlen(const uint8_t *s)
-{
- size_t ret = 0;
- size_t i;
- int m,n;
- uint8_t c;
+/* static size_t */
+/* GDKstrimp_strlen(const uint8_t *s) */
+/* { */
+/* size_t ret = 0; */
+/* size_t i; */
+/* int m,n; */
+/* uint8_t c; */
- i = 0;
- while((c = *(s + i)) != 0) {
- if (c < 0x80)
- i++;
- else {
- for (n = 0, m=0x40; c & m; n++, m >>= 1)
- ;
- /* n is now the number of 10xxxxxx bytes that should
- follow. */
- if (n == 0 || n >= 4)
- /* TODO: handle invalid utf-8 */
- {}
- i += n+1;
- }
- ret++;
- }
+/* i = 0; */
+/* while((c = *(s + i)) != 0) { */
+/* if (c < 0x80) */
+/* i++; */
+/* else { */
+/* for (n = 0, m=0x40; c & m; n++, m >>= 1) */
+/* ; */
+/* /\* n is now the number of 10xxxxxx bytes that should */
+/* follow. *\/ */
+/* if (n == 0 || n >= 4) */
+/* /\* TODO: handle invalid utf-8 *\/ */
+/* {} */
+/* i += n+1; */
+/* } */
+/* ret++; */
+/* } */
- return ret;
-}
+/* return ret; */
+/* } */
/* Given a BAT return the number of digrams in it. The observation is
* that the number of digrams is the number of characters - 1:
@@ -55,7 +55,7 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
// lng t0;
BUN i;
BATiter bi;
- uint8_t *s;
+ char *s;
// GDKtracer_set_component_level("ALGO", "DEBUG");
// struct canditer ci;
@@ -66,8 +66,9 @@ GDKstrimp_ndigrams(BAT *b, size_t *n)
bi = bat_iterator(b);
*n = 0;
for (i = 0; i < b->batCount; i++) {
- s = (uint8_t *)BUNtail(bi, i);
- *n += GDKstrimp_strlen(s) - 1;
+ s = (char *)BUNtail(bi, i);
+ // *n += GDKstrimp_strlen(s) - 1;
+ *n += strlen(s) - 1;
// TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s);
}
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list