Changeset: fd84e2108726 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=fd84e2108726
Modified Files:
	gdk/ChangeLog
	gdk/gdk_bat.c
Branch: default
Log Message:
Extend BATattach to also accept the str type using null-terminated strings. diffs (201 lines): diff --git a/gdk/ChangeLog b/gdk/ChangeLog --- a/gdk/ChangeLog +++ b/gdk/ChangeLog @@ -2,6 +2,8 @@ # This file is updated with Maddlog * Tue Jul 26 2016 Sjoerd Mullender <[email protected]> +- BATattach now can also create a str BAT from a file consisting of + null-terminated strings. The input file must be encoded using UTF-8. - BATattach now copies the input file instead of "stealing" it. - Removed the lastused "timestamp" from the BBP. - Removed batStamp field from BAT descriptor, and removed the BBPcurstamp diff --git a/gdk/gdk_bat.c b/gdk/gdk_bat.c --- a/gdk/gdk_bat.c +++ b/gdk/gdk_bat.c @@ -247,63 +247,143 @@ BAT * BATattach(int tt, const char *heapfile, int role) { BAT *bn; - struct stat st; - int atomsize; - BUN cap; char *p; - off_t n; size_t m; FILE *f; ERRORcheck(tt <= 0 , "BATattach: bad tail type (<=0)\n", NULL); - ERRORcheck(ATOMvarsized(tt), "BATattach: bad tail type (varsized)\n", NULL); - ERRORcheck(heapfile == 0, "BATattach: bad heapfile name\n", NULL); + ERRORcheck(ATOMvarsized(tt) && ATOMstorage(tt) != TYPE_str, "BATattach: bad tail type (varsized and not str)\n", NULL); + ERRORcheck(heapfile == NULL, "BATattach: bad heapfile name\n", NULL); ERRORcheck(role < 0 || role >= 32, "BATattach: role error\n", NULL); if ((f = fopen(heapfile, "rb")) == NULL) { GDKsyserror("BATattach: cannot open %s\n", heapfile); return NULL; } - if (fstat(fileno(f), &st) < 0) { - GDKsyserror("BATattach: cannot stat %s\n", heapfile); + if (ATOMstorage(tt) == TYPE_str) { + size_t n; + char *s; + int c, u; + + if ((bn = COLnew(0, tt, 0, role)) == NULL) { + fclose(f); + return NULL; + } + m = 4096; + n = 0; + u = 0; + s = p = GDKmalloc(m); + if (p == NULL) { + fclose(f); + BBPreclaim(bn); + return NULL; + } + while ((c = getc(f)) != EOF) { + if (n == m) { + m += 4096; + p = GDKrealloc(p, m); + s = p + n; + } + if (c == '\n' && n > 0 && s[-1] == '\r') { + /* deal with CR-LF 
sequence */ + s[-1] = c; + } else { + *s++ = c; + n++; + } + if (u) { + if ((c & 0xC0) == 0x80) + u--; + else + goto notutf8; + } else if ((c & 0xF8) == 0xF0) + u = 3; + else if ((c & 0xF0) == 0xE0) + u = 2; + else if ((c & 0xE0) == 0xC0) + u = 1; + else if ((c & 0x80) == 0x80) + goto notutf8; + else if (c == 0) { + if (BUNappend(bn, p, 0) != GDK_SUCCEED) { + BBPreclaim(bn); + fclose(f); + GDKfree(p); + return NULL; + } + s = p; + n = 0; + } + } fclose(f); - return NULL; - } - ERRORcheck(!S_ISREG(st.st_mode), "BATattach: heapfile must be a regular file\n", NULL); - atomsize = ATOMsize(tt); - ERRORcheck(st.st_size % atomsize != 0, "BATattach: heapfile size not integral number of atoms\n", NULL); - ERRORcheck((size_t) (st.st_size / atomsize) > (size_t) BUN_MAX, "BATattach: heapfile too large\n", NULL); - cap = (BUN) (st.st_size / atomsize); - bn = COLnew(0, tt, cap, role); - if (bn == NULL) - return NULL; - p = Tloc(bn, 0); - n = st.st_size; - while (n > 0 && (m = fread(p, 1, (size_t) MIN(1024*1024, n), f)) > 0) { - p += m; - n -= m; - } - fclose(f); - if (n > 0) { - GDKerror("BATattach: couldn't read the complete file\n"); - BBPreclaim(bn); - return NULL; - } - BATsetcount(bn, cap); + GDKfree(p); + if (n > 0) { + BBPreclaim(bn); + GDKerror("BATattach: last string is not null-terminated\n"); + return NULL; + } + } else { + struct stat st; + int atomsize; + BUN cap; + off_t n; - bn->tnonil = cap == 0; - bn->tnil = 0; - bn->tdense = 0; - if (cap > 1) { - bn->tsorted = 0; - bn->trevsorted = 0; - bn->tkey = 0; - } else { - bn->tsorted = 1; - bn->trevsorted = 1; - bn->tkey = 1; + if (fstat(fileno(f), &st) < 0) { + GDKsyserror("BATattach: cannot stat %s\n", heapfile); + fclose(f); + return NULL; + } + atomsize = ATOMsize(tt); + if (st.st_size % atomsize != 0) { + fclose(f); + GDKerror("BATattach: heapfile size not integral number of atoms\n"); + return NULL; + } + if ((size_t) (st.st_size / atomsize) > (size_t) BUN_MAX) { + fclose(f); + GDKerror("BATattach: heapfile too 
large\n"); + return NULL; + } + cap = (BUN) (st.st_size / atomsize); + bn = COLnew(0, tt, cap, role); + if (bn == NULL) { + fclose(f); + return NULL; + } + p = Tloc(bn, 0); + n = st.st_size; + while (n > 0 && (m = fread(p, 1, (size_t) MIN(1024*1024, n), f)) > 0) { + p += m; + n -= m; + } + fclose(f); + if (n > 0) { + GDKerror("BATattach: couldn't read the complete file\n"); + BBPreclaim(bn); + return NULL; + } + BATsetcount(bn, cap); + bn->tnonil = cap == 0; + bn->tnil = 0; + bn->tdense = 0; + if (cap > 1) { + bn->tsorted = 0; + bn->trevsorted = 0; + bn->tkey = 0; + } else { + bn->tsorted = 1; + bn->trevsorted = 1; + bn->tkey = 1; + } } return bn; + + notutf8: + fclose(f); + BBPreclaim(bn); + GDKfree(p); + GDKerror("BATattach: input is not UTF-8\n"); + return NULL; } /* _______________________________________________ checkin-list mailing list [email protected] https://www.monetdb.org/mailman/listinfo/checkin-list
