Changeset: 30d504324760 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=30d504324760
Added Files:
        gdk/gdk_string.c
Modified Files:
        gdk/Makefile.ag
        gdk/gdk_atoms.c
        gdk/gdk_private.h
Branch: default
Log Message:

Moved string atom implementation to a separate file.


diffs (truncated from 1632 to 300 lines):

diff --git a/gdk/Makefile.ag b/gdk/Makefile.ag
--- a/gdk/Makefile.ag
+++ b/gdk/Makefile.ag
@@ -21,7 +21,7 @@ lib_gdk = {
                gdk_orderidx.c \
                gdk_align.c gdk_bbp.c gdk_bbp.h \
                gdk_heap.c gdk_utils.c gdk_utils.h \
-               gdk_atoms.c gdk_atoms.h \
+               gdk_atoms.c gdk_atoms.h gdk_string.c \
                gdk_qsort.c gdk_qsort_impl.h \
                gdk_storage.c gdk_bat.c \
                gdk_delta.c gdk_cross.c gdk_system.c gdk_value.c \
diff --git a/gdk/gdk_atoms.c b/gdk/gdk_atoms.c
--- a/gdk/gdk_atoms.c
+++ b/gdk/gdk_atoms.c
@@ -258,7 +258,6 @@ const lng lng_nil = GDK_lng_min-1;
 const hge hge_nil = GDK_hge_min-1;
 #endif
 const oid oid_nil = (oid) 1 << (sizeof(oid) * 8 - 1);
-const char str_nil[2] = { '\200', 0 };
 const ptr ptr_nil = NULL;
 
 ptr
@@ -407,13 +406,11 @@ TYPE##ToStr(char **dst, size_t *len, con
        return snprintf(*dst, *len, FMT, FMTCAST *src); \
 }
 
-#define num08(x)       ((x) >= '0' && (x) <= '7')
 #define num10(x)       GDKisdigit(x)
-#define num16(x)       isxdigit((unsigned char) (x))
 #define base10(x)      ((x) - '0')
-#define base08(x)      ((x) - '0')
+
+#define num16(x)       isxdigit((unsigned char) (x))
 #define base16(x)      (((x) >= 'a' && (x) <= 'f') ? ((x) - 'a' + 10) : ((x) 
>= 'A' && (x) <= 'F') ? ((x) - 'A' + 10) : (x) - '0')
-#define mult08(x)      ((x) << 3)
 #define mult16(x)      ((x) << 4)
 
 static void *
@@ -1056,746 +1053,6 @@ fltToStr(char **dst, size_t *len, const 
 atom_io(flt, Int, int)
 
 
-/* String Atom Implementation
- *
- * Strings are stored in two parts.  The first part is the normal tail
- * heap which contains a list of offsets.  The second part is the
- * theap which contains the actual strings.  The offsets in the tail
- * heap (a.k.a. offset heap) point into the theap (a.k.a. string
- * heap).  Strings are NULL-terminated and are stored without any
- * escape sequec=nces.  Strings are encoded using the UTF-8 encoding
- * of Unicode.  This means that individual "characters" (really,
- * Unicode code points) can be between one and four bytes long.
- *
- * Because in many typical situations there are lots of duplicated
- * string values that are being stored in a table, but also in many
- * (other) typical situations there are very few duplicated string
- * values stored, a scheme has been introduced to cater to both
- * situations.
- *
- * When the string heap is "small" (defined as less than 64KiB), the
- * string heap is fully duplicate eliminated.  When the string heap
- * grows beyond this size, the heap is not kept free of duplicate
- * strings, but there is then a heuristic that tries to limit the
- * number of duplicates.
- *
- * This is done by having a fixed sized hash table at the start of the
- * string heap, and allocating space for collision lists in the first
- * 64KiB of the string heap.  After the first 64KiB no extra space is
- * allocated for lists, so hash collisions cannot be resolved.
- */
-
-int
-strNil(const char *s)
-{
-       return GDK_STRNIL(s);
-}
-
-size_t
-strLen(const char *s)
-{
-       return GDK_STRLEN(s);
-}
-
-static int
-strCmp(const char *l, const char *r)
-{
-       return GDK_STRCMP(l, r);
-}
-
-int
-strCmpNoNil(const unsigned char *l, const unsigned char *r)
-{
-       while (*l == *r) {
-               if (*l == 0)
-                       return 0;
-               l++;
-               r++;
-       }
-       return (*l < *r) ? -1 : 1;
-}
-
-static void
-strHeap(Heap *d, size_t cap)
-{
-       size_t size;
-
-       cap = MAX(cap, BATTINY);
-       size = GDK_STRHASHTABLE * sizeof(stridx_t) + MIN(GDK_ELIMLIMIT, cap * 
GDK_VARALIGN);
-       if (HEAPalloc(d, size, 1) == GDK_SUCCEED) {
-               d->free = GDK_STRHASHTABLE * sizeof(stridx_t);
-               d->dirty = 1;
-               memset(d->base, 0, d->free);
-               d->hashash = 0;
-#ifndef NDEBUG
-               /* fill should solve initialization problems within valgrind */
-               memset(d->base + d->free, 0, d->size - d->free);
-#endif
-       }
-}
-
-
-BUN
-strHash(const char *s)
-{
-       BUN res;
-
-       GDK_STRHASH(s, res);
-       return res;
-}
-
-void
-strCleanHash(Heap *h, bool rebuild)
-{
-       stridx_t newhash[GDK_STRHASHTABLE];
-       size_t pad, pos;
-       const size_t extralen = h->hashash ? EXTRALEN : 0;
-       BUN off, strhash;
-       const char *s;
-
-       (void) rebuild;
-       if (!h->cleanhash)
-               return;
-       /* rebuild hash table for double elimination
-        *
-        * If appending strings to the BAT was aborted, if the heap
-        * was memory mapped, the hash in the string heap may well be
-        * incorrect.  Therefore we don't trust it when we read in a
-        * string heap and we rebuild the complete table (it is small,
-        * so this won't take any time at all).
-        * Note that we will only do this the first time the heap is
-        * loaded, and only for heaps that existed when the server was
-        * started. */
-       memset(newhash, 0, sizeof(newhash));
-       pos = GDK_STRHASHSIZE;
-       while (pos < h->free && pos < GDK_ELIMLIMIT) {
-               pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
-               if (pad < sizeof(stridx_t))
-                       pad += GDK_VARALIGN;
-               pos += pad + extralen;
-               s = h->base + pos;
-               if (h->hashash)
-                       strhash = ((const BUN *) s)[-1];
-               else
-                       GDK_STRHASH(s, strhash);
-               off = strhash & GDK_STRHASHMASK;
-               newhash[off] = (stridx_t) (pos - extralen - sizeof(stridx_t));
-               pos += GDK_STRLEN(s);
-       }
-       /* only set dirty flag if the hash table actually changed */
-       if (memcmp(newhash, h->base, sizeof(newhash)) != 0) {
-               memcpy(h->base, newhash, sizeof(newhash));
-               if (h->storage == STORE_MMAP) {
-                       if (!(GDKdebug & NOSYNCMASK))
-                               (void) MT_msync(h->base, GDK_STRHASHSIZE);
-               } else
-                       h->dirty = 1;
-       }
-#ifndef NDEBUG
-       if (GDK_ELIMDOUBLES(h)) {
-               pos = GDK_STRHASHSIZE;
-               while (pos < h->free) {
-                       pad = GDK_VARALIGN - (pos & (GDK_VARALIGN - 1));
-                       if (pad < sizeof(stridx_t))
-                               pad += GDK_VARALIGN;
-                       pos += pad + extralen;
-                       s = h->base + pos;
-                       assert(strLocate(h, s) != 0);
-                       pos += GDK_STRLEN(s);
-               }
-       }
-#endif
-       h->cleanhash = 0;
-}
-
-/*
- * The strPut routine. The routine strLocate can be used to identify
- * the location of a string in the heap if it exists. Otherwise it
- * returns zero.
- */
-var_t
-strLocate(Heap *h, const char *v)
-{
-       stridx_t *ref, *next;
-       const size_t extralen = h->hashash ? EXTRALEN : 0;
-
-       /* search hash-table, if double-elimination is still in place */
-       BUN off;
-       GDK_STRHASH(v, off);
-       off &= GDK_STRHASHMASK;
-
-       /* should only use strLocate iff fully double eliminated */
-       assert(GDK_ELIMBASE(h->free) == 0);
-
-       /* search the linked list */
-       for (ref = ((stridx_t *) h->base) + off; *ref; ref = next) {
-               next = (stridx_t *) (h->base + *ref);
-               if (GDK_STRCMP(v, (str) (next + 1) + extralen) == 0)
-                       return (var_t) ((sizeof(stridx_t) + *ref + extralen));  
/* found */
-       }
-       return 0;
-}
-
-static var_t
-strPut(Heap *h, var_t *dst, const char *v)
-{
-       size_t elimbase = GDK_ELIMBASE(h->free);
-       size_t pad;
-       size_t pos, len = GDK_STRLEN(v);
-       const size_t extralen = h->hashash ? EXTRALEN : 0;
-       stridx_t *bucket;
-       BUN off, strhash;
-
-       GDK_STRHASH(v, off);
-       strhash = off;
-       off &= GDK_STRHASHMASK;
-       bucket = ((stridx_t *) h->base) + off;
-
-       if (*bucket) {
-               /* the hash list is not empty */
-               if (*bucket < GDK_ELIMLIMIT) {
-                       /* small string heap (<64KiB) -- fully double
-                        * eliminated: search the linked list */
-                       const stridx_t *ref = bucket;
-
-                       do {
-                               pos = *ref + sizeof(stridx_t) + extralen;
-                               if (GDK_STRCMP(v, h->base + pos) == 0) {
-                                       /* found */
-                                       return *dst = (var_t) pos;
-                               }
-                               ref = (stridx_t *) (h->base + *ref);
-                       } while (*ref);
-               } else {
-                       /* large string heap (>=64KiB) -- there is no
-                        * linked list, so only look at single
-                        * entry */
-                       pos = *bucket + extralen;
-                       if (GDK_STRCMP(v, h->base + pos) == 0) {
-                               /* already in heap: reuse */
-                               return *dst = (var_t) pos;
-                       }
-               }
-       }
-       /* the string was not found in the heap, we need to enter it */
-
-       pad = GDK_VARALIGN - (h->free & (GDK_VARALIGN - 1));
-       if (elimbase == 0) {    /* i.e. h->free < GDK_ELIMLIMIT */
-               if (pad < sizeof(stridx_t)) {
-                       /* make room for hash link */
-                       pad += GDK_VARALIGN;
-               }
-       } else if (extralen == 0) { /* i.e., h->hashash == FALSE */
-               /* no VARSHIFT and no string hash value stored => no
-                * padding/alignment needed */
-               pad = 0;
-       } else {
-               /* pad to align on VARALIGN for VARSHIFT and/or string
-                * hash value */
-               pad &= (GDK_VARALIGN - 1);
-       }
-
-       /* check heap for space (limited to a certain maximum after
-        * which nils are inserted) */
-       if (h->free + pad + len + extralen >= h->size) {
-               size_t newsize = MAX(h->size, 4096);
-
-               /* double the heap size until we have enough space */
-               do {
-                       if (newsize < 4 * 1024 * 1024)
-                               newsize <<= 1;
-                       else
-                               newsize += 4 * 1024 * 1024;
-               } while (newsize <= h->free + pad + len + extralen);
-
-               assert(newsize);
-
-               if (h->free + pad + len + extralen >= (size_t) VAR_MAX) {
-                       GDKerror("strPut: string heaps gets larger than 
%zuGiB.\n", (size_t) VAR_MAX >> 30);
-                       return 0;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to