Changeset: b73fdf565afa for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/b73fdf565afa
Modified Files:
	monetdb5/modules/kernel/batstr.c
Branch: txtsim
Log Message:
Improve batstr reverse by using fastins instead of BUNappend. diffs (210 lines): diff --git a/monetdb5/modules/kernel/batstr.c b/monetdb5/modules/kernel/batstr.c --- a/monetdb5/modules/kernel/batstr.c +++ b/monetdb5/modules/kernel/batstr.c @@ -1983,7 +1983,7 @@ BATSTRstarts_with_strcst(Client cntxt, M } static str -BATSTRends_with_str_cst(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) +BATSTRends_with_strcst(Client cntxt, MalBlkPtr mb, MalStkPtr stk, InstrPtr pci) { bit *icase = NULL; switch (pci->argc) { @@ -5243,114 +5243,119 @@ BATSTRasciify(bat *ret, bat *bid) #endif } +static inline void +str_reverse(char *dst, const char *src, size_t len) +{ + dst[len] = 0; + if (strNil(src)) { + assert(len == strlen(str_nil)); + strcpy(dst, str_nil); + return; + } + while (*src) { + if ((*src & 0xF8) == 0xF0) { + /* 4 byte UTF-8 sequence */ + assert(len >= 4); + dst[len - 4] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 3] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 4; + } else if ((*src & 0xF0) == 0xE0) { + /* 3 byte UTF-8 sequence */ + assert(len >= 3); + dst[len - 3] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 3; + } else if ((*src & 0xE0) == 0xC0) { + /* 2 byte UTF-8 sequence */ + assert(len >= 2); + dst[len - 2] = *src++; + assert((*src & 0xC0) == 0x80); + dst[len - 1] = *src++; + len -= 2; + } else { + /* 1 byte UTF-8 "sequence" */ + assert(len >= 1); + assert((*src & 0x80) == 0); + dst[--len] = *src++; + } + } + assert(len == 0); +} + static str BATSTRreverse(bat *ret, const bat *arg) { BAT *b, *bn; BATiter bi; - BUN start, end; + BUN p, q; const char *src; - /* Allocate temporary space for reversed strings; - we grow this if we need more. 
*/ size_t len, dst_len = 1024; - int i = -1; - str dst, error[2] = { GDK_EXCEPTION, MAL_MALLOC_FAIL }; - /* Use zalloc to force valid UTF-8 */ + str dst, msg = MAL_SUCCEED; + bool nils = false; + if ((dst = GDKzalloc(dst_len)) == NULL) throw(MAL, "batstr.reverse", MAL_MALLOC_FAIL); if ((b = BATdescriptor(*arg)) == NULL) { GDKfree(dst); throw(MAL, "batstr.reverse", RUNTIME_OBJECT_MISSING); } - /* We should only get called for string BATs */ assert(b->ttype == TYPE_str); - /* Allocate result BAT */ bn = COLnew(b->hseqbase, TYPE_str, BATcount(b), TRANSIENT); if(bn == NULL) { BBPunfix(b->batCacheid); GDKfree(dst); throw(MAL, "batstr.reverse", MAL_MALLOC_FAIL); } - /* Loop through BAT b; 'start' is index of the entry we're working - on, 'end' is used internally by BATloop to do the iterating */ bi = bat_iterator(b); - BATloop(b, start, end) { - src = (const char *) BUNtail(bi, start); + BATloop(b, p, q) { + src = (const char *) BUNtail(bi, p); if (strNil(src)) { assert(len > strlen(src)); + nils = true; strcpy(dst, str_nil); } else { len = strlen(src); - /* make sure dst is large enough */ if (len >= dst_len) { dst_len = len + 1024; if ((dst = GDKrealloc(dst, dst_len)) == NULL) { - i = 1; + msg = createException(MAL,"batstr.reverse", MAL_MALLOC_FAIL); goto bail; } } - /* All strings in MonetDB are encoded using UTF-8; we must - * make sure that the reversed string is also encoded in valid - * UTF-8, so we treat multibyte characters as single units */ - while (*src) { - if ((*src & 0xF8) == 0xF0) { - /* 4 byte UTF-8 sequence */ - assert(len >= 4); - dst[len - 4] = *src++; - assert((*src & 0xC0) == 0x80); - dst[len - 3] = *src++; - assert((*src & 0xC0) == 0x80); - dst[len - 2] = *src++; - assert((*src & 0xC0) == 0x80); - dst[len - 1] = *src++; - len -= 4; - } else if ((*src & 0xF0) == 0xE0) { - /* 3 byte UTF-8 sequence */ - assert(len >= 3); - dst[len - 3] = *src++; - assert((*src & 0xC0) == 0x80); - dst[len - 2] = *src++; - assert((*src & 0xC0) == 0x80); - dst[len - 1] = 
*src++; - len -= 3; - } else if ((*src & 0xE0) == 0xC0) { - /* 2 byte UTF-8 sequence */ - assert(len >= 2); - dst[len - 2] = *src++; - assert((*src & 0xC0) == 0x80); - dst[len - 1] = *src++; - len -= 2; - } else { - /* 1 byte UTF-8 "sequence" */ - assert(len >= 1); - assert((*src & 0x80) == 0); - dst[--len] = *src++; - } - } - assert(len == 0); + str_reverse(dst, src, len); } - if (BUNappend(bn, dst, false) != GDK_SUCCEED) { - /* BUNappend can fail since it may have to grow memory - areas, especially true for string BATs */ - i = 0; + if (tfastins_nocheckVAR(bn, p, dst) != GDK_SUCCEED) { + msg = createException(MAL,"batstr.reverse", GDK_EXCEPTION); goto bail; } } bat_iterator_end(&bi); + BATsetcount(bn, q); + bn->theap->dirty |= BATcount(bn) > 0; + bn->tnil = nils; + bn->tnonil = !nils; + bn->tkey = BATcount(bn) <= 1; + bn->tsorted = BATcount(bn) <= 1; + bn->trevsorted = BATcount(bn) <= 1; GDKfree(dst); BBPunfix(b->batCacheid); *ret = bn->batCacheid; BBPkeepref(bn); - return MAL_SUCCEED; + return msg; bail: - /* We only get here in the case of an allocation error; - clean up the mess we've created and throw an exception */ bat_iterator_end(&bi); GDKfree(dst); - BBPunfix(b->batCacheid); - BBPunfix(bn->batCacheid); - throw(MAL, "batstr.reverse", "%s", error[i]); + unfix_inputs(b->batCacheid, bn->batCacheid); + throw(MAL, "batstr.reverse", "%s", msg); } #include "mel.h" @@ -5427,10 +5432,10 @@ mel_func batstr_init_funcs[] = { pattern("batstr", "endsWith", BATSTRends_with_cst, false, "Check if bat string ends with substring, icase flag.", args(1,4, batarg("",bit),batarg("s",str),arg("prefix",str),arg("icase",bit))), pattern("batstr", "endsWith", BATSTRends_with_cst, false, "Check if bat string(with CL) ends with substring.", args(1,4, batarg("",bit),batarg("s",str),arg("prefix",str),batarg("s",oid))), pattern("batstr", "endsWith", BATSTRends_with_cst, false, "Check if bat string(with CL) ends with substring + icase flag.", args(1,5, 
batarg("",bit),batarg("s",str),arg("prefix",str),batarg("s",oid),arg("icase",bit))), - pattern("batstr", "endsWith", BATSTRends_with_str_cst, false, "Check if string ends with bat substring.", args(1,3, batarg("",bit),arg("s",str),batarg("prefix",str))), - pattern("batstr", "endsWith", BATSTRends_with_str_cst, false, "Check if string ends with bat substring + icase flag.", args(1,4, batarg("",bit),arg("s",str),batarg("prefix",str),arg("icase",bit))), - pattern("batstr", "endsWith", BATSTRends_with_str_cst, false, "Check if string ends with bat substring(with CL).", args(1,4, batarg("",bit),arg("s",str),batarg("prefix",str),batarg("s",oid))), - pattern("batstr", "endsWith", BATSTRends_with_str_cst, false, "Check if string ends with bat substring(with CL) + icase flag.", args(1,5, batarg("",bit),arg("s",str),batarg("prefix",str),batarg("s",oid),arg("icase",bit))), + pattern("batstr", "endsWith", BATSTRends_with_strcst, false, "Check if string ends with bat substring.", args(1,3, batarg("",bit),arg("s",str),batarg("prefix",str))), + pattern("batstr", "endsWith", BATSTRends_with_strcst, false, "Check if string ends with bat substring + icase flag.", args(1,4, batarg("",bit),arg("s",str),batarg("prefix",str),arg("icase",bit))), + pattern("batstr", "endsWith", BATSTRends_with_strcst, false, "Check if string ends with bat substring(with CL).", args(1,4, batarg("",bit),arg("s",str),batarg("prefix",str),batarg("s",oid))), + pattern("batstr", "endsWith", BATSTRends_with_strcst, false, "Check if string ends with bat substring(with CL) + icase flag.", args(1,5, batarg("",bit),arg("s",str),batarg("prefix",str),batarg("s",oid),arg("icase",bit))), pattern("batstr", "splitpart", STRbatsplitpart, false, "Split string on delimiter. Returns\ngiven field (counting from one.)", args(1,4, batarg("",str),batarg("s",str),batarg("needle",str),batarg("field",int))), pattern("batstr", "splitpart", STRbatsplitpartcst, false, "Split string on delimiter. 
Returns\ngiven field (counting from one.)", args(1,4, batarg("",str),batarg("s",str),arg("needle",str),arg("field",int))), pattern("batstr", "splitpart", STRbatsplitpart_needlecst, false, "Split string on delimiter. Returns\ngiven field (counting from one.)", args(1,4, batarg("",str),batarg("s",str),arg("needle",str),batarg("field",int))), _______________________________________________ checkin-list mailing list -- checkin-list@monetdb.org To unsubscribe send an email to checkin-list-le...@monetdb.org