Changeset: 3977ea91b871 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3977ea91b871
Added Files:
        documentation/source/bincopy-backref.rst
Modified Files:
        documentation/index.rst
        sql/backends/monet5/sql_bincopyconvert.c
        sql/backends/monet5/sql_bincopyconvert.h
Branch: copyfaster
Log Message:

Add backref encoding for strings in COPY BINARY


diffs (truncated from 363 to 300 lines):

diff --git a/documentation/index.rst b/documentation/index.rst
--- a/documentation/index.rst
+++ b/documentation/index.rst
@@ -27,6 +27,7 @@ Welcome to MonetDB's documentation!
    source/design
    source/input
    source/json
+   source/bincopy-backref
    source/cmake
    source/release
    source/developers_handbook
diff --git a/documentation/source/bincopy-backref.rst 
b/documentation/source/bincopy-backref.rst
new file mode 100644
--- /dev/null
+++ b/documentation/source/bincopy-backref.rst
@@ -0,0 +1,91 @@
+*******************************************
+Backref encoding for strings in COPY BINARY
+*******************************************
+
+With COPY BINARY, a string column is encoded as the concatenation of the
+NUL-terminated strings. Nil is encoded as ``80 00``, that is,
+byte value 0x80=128 followed by byte value 0. This is safe because
+valid UTF-8 encodings never start with bytes in the ranges
+0x80..0xBF or 0xF8..0xFF.
+
+We have now extended the file format with a way to reuse earlier
+entries in the file. This is useful for two reasons
+
+1. It saves space because the backreference is almost always shorter
+   than the string would have been
+
+2. It saves time because inserting a string value into a BAT takes quite
+   some effort compared to inserting for example an integer, but reusing
+   a string that has been inserted before is about as cheap as inserting
+   an integer.
+
+The backreferences are always relative to the the current end of the
+BAT, so the most recently inserted entry is referred to as 1, the one
+before as 2, etc. We'll write this number as N. Note that N is never 0.
+
+If N<=63, the backreference is encoded as the single byte 0x80+N.
+It is not followed by a 0x00.
+
+If N >= 64, N is written as N = N0 + 128*N1 + 128^2*N2 + .. +128^i*Ni .
+The backreference is then encoded as::
+
+    0x80 0x80+N0 .. 0x80+N(i-1) + Ni
+
+For example, N = 500 = 116 + 3 * 128 is encoded as::
+
+    0x80 0xF4 0x03
+
+Again there is no trailing 00.
+
+Sample Python code::
+
+    def backref_encode(strings_or_nones):
+        memory = {}
+        prev_nil = -64
+        for i, s in enumerate(strings_or_nones):
+            if s is not None:
+                prev = memory.get(s)
+                memory[s] = i  # always prefer latest
+                if prev is None:
+                    # nul-terminated string
+                    yield bytes(s, 'utf-8') + b'\x00'
+                else:
+                    delta = i - prev
+                    if delta < 64:
+                        # short backref
+                        yield bytes([0x80 + delta])
+                    else:
+                        # long backref
+                        buf = bytearray([0x80])
+                        while delta > 0:
+                            chunk = delta % 128
+                            if chunk < delta:
+                                chunk += 0x80  # not the last
+                            buf.append(chunk)
+                            delta = delta // 128
+                        yield buf
+            else:
+                delta = i - prev_nil
+                prev_nil = i
+                if delta < 64:
+                    # short backref to earlier nil
+                    yield bytes([0x80 + delta])
+                else:
+                    # no space advantage, just encode the nil
+                    yield b'\x80\x00'
+
+
+    input = ['foo', None, None, 'foo']
+    expected = b'foo\x00' + b'\x80\x00' + b'\x81' + b'\x83'
+    actual = b''.join(backref_encode(input))
+    assert actual == expected
+
+    input = ['foo', None] + 500 * ['bar'] + ['foo', None]
+    expected = (
+        (b'foo\x00' + b'\x80\x00')
+        + (b'bar\x00' + 499 * b'\x81')
+        # 502 == 118 + 3 * 128; 118 + 128 == 0xF6
+        + (b'\x80\xF6\x03' + b'\x80\x00')
+    )
+    actual = b''.join(backref_encode(input))
+    assert actual == expected
diff --git a/sql/backends/monet5/sql_bincopyconvert.c 
b/sql/backends/monet5/sql_bincopyconvert.c
--- a/sql/backends/monet5/sql_bincopyconvert.c
+++ b/sql/backends/monet5/sql_bincopyconvert.c
@@ -330,12 +330,12 @@ encode_timestamp(void *dst_, void *src_,
 
 void init_insert_state(struct insert_state *st, allocator *ma, BAT *bat, int 
width) {
        *st = (struct insert_state) {
+               .ma = ma,
                .bat = bat,
                .width = width,
                .scratch = NULL,
                .schratch_len = 0,
-               .left_over = 0,
-               .ma = ma,
+               .resume = 0,
        };
 };
 
@@ -346,65 +346,155 @@ void release_insert_state(struct insert_
 }
 
 static str
-insert_one_nul_terminated(struct insert_state *st, const char *item, size_t 
item_len)
+reinsert(struct insert_state *st, BUN bun)
+{
+       if (bun >= BATcount(st->bat))
+               throw(SQL, "insert_nul_terminated_values", SQLSTATE(42000) 
"invalid repeat bun " BUNFMT, bun);
+       if (BATcount(st->bat) == BATcapacity(st->bat)) {
+               if (BATextend(st->bat, BATgrows(st->bat)) != GDK_SUCCEED)
+                               throw(SQL, "insert_nul_terminated_values", 
GDK_EXCEPTION);
+       }
+       void *src = Tloc(st->bat, bun);
+       void *dst = Tloc(st->bat, BATcount(st->bat));
+       switch (st->bat->twidth) {
+               case 1:
+                       *(uint8_t*)dst = *(uint8_t*)src;
+                       break;
+               case 2:
+                       *(uint16_t*)dst = *(uint16_t*)src;
+                       break;
+               case 4:
+                       *(uint32_t*)dst = *(uint32_t*)src;
+                       break;
+               case 8:
+                       *(uint64_t*)dst = *(uint64_t*)src;
+                       break;
+               default:
+                       MT_UNREACHABLE();
+       }
+       st->bat->batCount++;
+       return MAL_SUCCEED;
+}
+
+static str
+insert_non_nil(struct insert_state *st, const char *item)
 {
        int tpe = BATttype(st->bat);
        const void *value;
 
-       assert(item_len > 0);
-       assert(item[item_len - 1] == '\0');
-
-       unsigned char *uitem = (unsigned char *)item;
-
-       if (uitem[0] == 0x80 && item_len == 2) {
-               value = ATOMnilptr(tpe);
+       if (!checkUTF8(item)) {
+               throw(SQL, "insert_nul_terminated_values", SQLSTATE(42000) 
"malformed utf-8 byte sequence");
+       }
+       if (tpe == TYPE_str) {
+               if (st->width > 0 && UTF8_strlen(item) > st->width)
+                       throw(SQL, "insert_nul_terminated_values", "string too 
wide for column");
+               value = item;
        } else {
-               if (!checkUTF8(item)) {
-                       throw(SQL, "insert_nul_terminated", SQLSTATE(42000) 
"malformed utf-8 byte sequence");
-               }
-               if (tpe == TYPE_str) {
-                       if (st->width > 0 && UTF8_strlen(item) > st->width)
-                               throw(SQL, "insert_nul_terminated", "string too 
wide for column");
-                       value = item;
-               } else {
-                       ssize_t n = ATOMfromstr(st->ma, tpe, &st->scratch, 
&st->schratch_len, item, false);
-                       if (n <= 0)
-                               throw(SQL, "insert_nul_terminated", 
GDK_EXCEPTION);
-                       value = st->scratch;
-               }
+               ssize_t n = ATOMfromstr(st->ma, tpe, &st->scratch, 
&st->schratch_len, item, false);
+               if (n <= 0)
+                       throw(SQL, "insert_nul_terminated_values", 
GDK_EXCEPTION);
+               value = st->scratch;
        }
 
        // By now 'value' has been set
        if (bunfastapp(st->bat, value) != GDK_SUCCEED) {
-               throw(SQL, "insert_nul_terminated", GDK_EXCEPTION);
+               throw(SQL, "insert_nul_terminated_values", GDK_EXCEPTION);
        }
 
        return MAL_SUCCEED;
 }
 
-str insert_some_nul_terminated(struct insert_state *st, const char *data, 
size_t total_len, size_t *consumed)
+static str
+insert_nil(struct insert_state *st)
 {
-       str msg;
-       assert(st->left_over <= total_len);
+       int tpe = BATttype(st->bat);
+       const void *value = ATOMnilptr(tpe);
+       if (bunfastapp(st->bat, value) != GDK_SUCCEED) {
+               throw(SQL, "insert_nul_terminated_values", GDK_EXCEPTION);
+       }
+       return MAL_SUCCEED;
+}
 
-       const char *start = data; // start of the current item
-       const char *pos = data + st->left_over; // where to start searching
-       const char *limit = data + total_len;
+str
+insert_nul_terminated_values(struct insert_state *st, const char *data, size_t 
total_len, size_t *consumed)
+{
+       assert(st->resume < total_len);
+
+       // We use unsigned char to make the backref computations easier
+       const unsigned char *start = (const unsigned char*)data;
+       const unsigned char *limit = start + total_len;
+       const unsigned char *current = start;   // start of the current item
+       const unsigned char *resume = current + st->resume; // where to start 
looking for NUL
 
-       while (pos < limit) {
-               const char *end = memchr(pos, '\0', limit - pos);
-               if (end == NULL)
-                       break;
-               size_t len = end + 1 - start;
-               msg = insert_one_nul_terminated(st, start, len);
-               if (msg != MAL_SUCCEED)
-                       return msg;
-               start += len;
-               pos = start;
+       while (current < limit) {
+               // Recognize the item and determine where it ends.
+               // If we reach 'limit' we'll goto end without updating 
'current'.
+               const unsigned char *pos = current;
+               const unsigned char first = *pos++;
+               if ((first & 0xC0) != 0x80) {
+                       // Not a nil, not a backref. Find out how long it is
+                       pos = memchr(resume, '\0', limit - resume);
+                       if (pos == NULL) {
+                               // the end of the string is not yet in our 
buffer
+                               resume = limit;
+                               goto end;
+                       }
+                       pos++; // include the NUL terminator
+                       str msg = insert_non_nil(st, (char*)current);
+                       if (msg != MAL_SUCCEED)
+                               return msg;
+               } else if (first > 0x80) {
+                       // 0x81 .. 0xBF, a short back ref
+                       assert(first <= 0xBF);
+                       BUN delta = first - 0x80;
+                       reinsert(st, BATcount(st->bat) - delta);
+               } else {
+                       // 0x80 so it's either a nil or a long backref
+                       assert(first == 0x80);
+                       if (pos == limit) {
+                               // can't tell the difference
+                               resume = current;
+                               goto end;
+                       }
+                       unsigned char follower = *pos++;
+                       if (follower == '\0') {
+                               // it's a nil
+                               str msg = insert_nil(st);
+                               if (msg != MAL_SUCCEED)
+                                       return msg;
+                       } else {
+                               // it's a long backref
+                               BUN delta = follower & 0x7F;
+                               unsigned int shift = 0;
+                               while (follower > 0x7F) {
+                                       if (pos == limit) {
+                                               // incomplete
+                                               resume = current;
+                                               goto end;
+                                       }
+                                       if (shift > 8 * sizeof(BUN) - 14) {
+                                               // the payload is 7 bits wide, 
if we increase shift
+                                               // by 7 it's going to overflow.
+                                               // TODO maybe we need to set a 
stricter limit?
+                                               throw(SQL, 
"insert_nul_terminated_values", SQLSTATE(42000)"invalid backref in binary data 
at %ld", (long)BATcount(st->bat));
+                                       }
+                                       shift += 7;
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]

Reply via email to