Changeset: 3977ea91b871 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/3977ea91b871
Added Files:
documentation/source/bincopy-backref.rst
Modified Files:
documentation/index.rst
sql/backends/monet5/sql_bincopyconvert.c
sql/backends/monet5/sql_bincopyconvert.h
Branch: copyfaster
Log Message:
Add backref encoding for strings in COPY BINARY
diffs (truncated from 363 to 300 lines):
diff --git a/documentation/index.rst b/documentation/index.rst
--- a/documentation/index.rst
+++ b/documentation/index.rst
@@ -27,6 +27,7 @@ Welcome to MonetDB's documentation!
source/design
source/input
source/json
+ source/bincopy-backref
source/cmake
source/release
source/developers_handbook
diff --git a/documentation/source/bincopy-backref.rst
b/documentation/source/bincopy-backref.rst
new file mode 100644
--- /dev/null
+++ b/documentation/source/bincopy-backref.rst
@@ -0,0 +1,91 @@
+*******************************************
+Backref encoding for strings in COPY BINARY
+*******************************************
+
+With COPY BINARY, a string column is encoded as the concatenation of the
+NUL-terminated strings. Nil is encoded as ``80 00``, that is,
+byte value 0x80=128 followed by byte value 0. This is safe because
+valid UTF-8 encodings never start with bytes in the ranges
+0x80..0xBF or 0xF8..0xFF.
+
+We have now extended the file format with a way to reuse earlier
+entries in the file. This is useful for two reasons
+
+1. It saves space because the backreference is almost always shorter
+ than the string would have been
+
+2. It saves time because inserting a string value into a BAT takes quite
+ some effort compared to inserting for example an integer, but reusing
+ a string that has been inserted before is about as cheap as inserting
+ an integer.
+
+The backreferences are always relative to the the current end of the
+BAT, so the most recently inserted entry is referred to as 1, the one
+before as 2, etc. We'll write this number as N. Note that N is never 0.
+
+If N<=63, the backreference is encoded as the single byte 0x80+N.
+It is not followed by a 0x00.
+
+If N >= 64, N is written as N = N0 + 128*N1 + 128^2*N2 + .. +128^i*Ni .
+The backreference is then encoded as::
+
+ 0x80 0x80+N0 .. 0x80+N(i-1) + Ni
+
+For example, N = 500 = 116 + 3 * 128 is encoded as::
+
+ 0x80 0xF4 0x03
+
+Again there is no trailing 00.
+
+Sample Python code::
+
+ def backref_encode(strings_or_nones):
+ memory = {}
+ prev_nil = -64
+ for i, s in enumerate(strings_or_nones):
+ if s is not None:
+ prev = memory.get(s)
+ memory[s] = i # always prefer latest
+ if prev is None:
+ # nul-terminated string
+ yield bytes(s, 'utf-8') + b'\x00'
+ else:
+ delta = i - prev
+ if delta < 64:
+ # short backref
+ yield bytes([0x80 + delta])
+ else:
+ # long backref
+ buf = bytearray([0x80])
+ while delta > 0:
+ chunk = delta % 128
+ if chunk < delta:
+ chunk += 0x80 # not the last
+ buf.append(chunk)
+ delta = delta // 128
+ yield buf
+ else:
+ delta = i - prev_nil
+ prev_nil = i
+ if delta < 64:
+ # short backref to earlier nil
+ yield bytes([0x80 + delta])
+ else:
+ # no space advantage, just encode the nil
+ yield b'\x80\x00'
+
+
+ input = ['foo', None, None, 'foo']
+ expected = b'foo\x00' + b'\x80\x00' + b'\x81' + b'\x83'
+ actual = b''.join(backref_encode(input))
+ assert actual == expected
+
+ input = ['foo', None] + 500 * ['bar'] + ['foo', None]
+ expected = (
+ (b'foo\x00' + b'\x80\x00')
+ + (b'bar\x00' + 499 * b'\x81')
+ # 502 == 118 + 3 * 128; 118 + 128 == 0xF6
+ + (b'\x80\xF6\x03' + b'\x80\x00')
+ )
+ actual = b''.join(backref_encode(input))
+ assert actual == expected
diff --git a/sql/backends/monet5/sql_bincopyconvert.c
b/sql/backends/monet5/sql_bincopyconvert.c
--- a/sql/backends/monet5/sql_bincopyconvert.c
+++ b/sql/backends/monet5/sql_bincopyconvert.c
@@ -330,12 +330,12 @@ encode_timestamp(void *dst_, void *src_,
void init_insert_state(struct insert_state *st, allocator *ma, BAT *bat, int
width) {
*st = (struct insert_state) {
+ .ma = ma,
.bat = bat,
.width = width,
.scratch = NULL,
.schratch_len = 0,
- .left_over = 0,
- .ma = ma,
+ .resume = 0,
};
};
@@ -346,65 +346,155 @@ void release_insert_state(struct insert_
}
static str
-insert_one_nul_terminated(struct insert_state *st, const char *item, size_t
item_len)
+reinsert(struct insert_state *st, BUN bun)
+{
+ if (bun >= BATcount(st->bat))
+ throw(SQL, "insert_nul_terminated_values", SQLSTATE(42000)
"invalid repeat bun " BUNFMT, bun);
+ if (BATcount(st->bat) == BATcapacity(st->bat)) {
+ if (BATextend(st->bat, BATgrows(st->bat)) != GDK_SUCCEED)
+ throw(SQL, "insert_nul_terminated_values",
GDK_EXCEPTION);
+ }
+ void *src = Tloc(st->bat, bun);
+ void *dst = Tloc(st->bat, BATcount(st->bat));
+ switch (st->bat->twidth) {
+ case 1:
+ *(uint8_t*)dst = *(uint8_t*)src;
+ break;
+ case 2:
+ *(uint16_t*)dst = *(uint16_t*)src;
+ break;
+ case 4:
+ *(uint32_t*)dst = *(uint32_t*)src;
+ break;
+ case 8:
+ *(uint64_t*)dst = *(uint64_t*)src;
+ break;
+ default:
+ MT_UNREACHABLE();
+ }
+ st->bat->batCount++;
+ return MAL_SUCCEED;
+}
+
+static str
+insert_non_nil(struct insert_state *st, const char *item)
{
int tpe = BATttype(st->bat);
const void *value;
- assert(item_len > 0);
- assert(item[item_len - 1] == '\0');
-
- unsigned char *uitem = (unsigned char *)item;
-
- if (uitem[0] == 0x80 && item_len == 2) {
- value = ATOMnilptr(tpe);
+ if (!checkUTF8(item)) {
+ throw(SQL, "insert_nul_terminated_values", SQLSTATE(42000)
"malformed utf-8 byte sequence");
+ }
+ if (tpe == TYPE_str) {
+ if (st->width > 0 && UTF8_strlen(item) > st->width)
+ throw(SQL, "insert_nul_terminated_values", "string too
wide for column");
+ value = item;
} else {
- if (!checkUTF8(item)) {
- throw(SQL, "insert_nul_terminated", SQLSTATE(42000)
"malformed utf-8 byte sequence");
- }
- if (tpe == TYPE_str) {
- if (st->width > 0 && UTF8_strlen(item) > st->width)
- throw(SQL, "insert_nul_terminated", "string too
wide for column");
- value = item;
- } else {
- ssize_t n = ATOMfromstr(st->ma, tpe, &st->scratch,
&st->schratch_len, item, false);
- if (n <= 0)
- throw(SQL, "insert_nul_terminated",
GDK_EXCEPTION);
- value = st->scratch;
- }
+ ssize_t n = ATOMfromstr(st->ma, tpe, &st->scratch,
&st->schratch_len, item, false);
+ if (n <= 0)
+ throw(SQL, "insert_nul_terminated_values",
GDK_EXCEPTION);
+ value = st->scratch;
}
// By now 'value' has been set
if (bunfastapp(st->bat, value) != GDK_SUCCEED) {
- throw(SQL, "insert_nul_terminated", GDK_EXCEPTION);
+ throw(SQL, "insert_nul_terminated_values", GDK_EXCEPTION);
}
return MAL_SUCCEED;
}
-str insert_some_nul_terminated(struct insert_state *st, const char *data,
size_t total_len, size_t *consumed)
+static str
+insert_nil(struct insert_state *st)
{
- str msg;
- assert(st->left_over <= total_len);
+ int tpe = BATttype(st->bat);
+ const void *value = ATOMnilptr(tpe);
+ if (bunfastapp(st->bat, value) != GDK_SUCCEED) {
+ throw(SQL, "insert_nul_terminated_values", GDK_EXCEPTION);
+ }
+ return MAL_SUCCEED;
+}
- const char *start = data; // start of the current item
- const char *pos = data + st->left_over; // where to start searching
- const char *limit = data + total_len;
+str
+insert_nul_terminated_values(struct insert_state *st, const char *data, size_t
total_len, size_t *consumed)
+{
+ assert(st->resume < total_len);
+
+ // We use unsigned char to make the backref computations easier
+ const unsigned char *start = (const unsigned char*)data;
+ const unsigned char *limit = start + total_len;
+ const unsigned char *current = start; // start of the current item
+ const unsigned char *resume = current + st->resume; // where to start
looking for NUL
- while (pos < limit) {
- const char *end = memchr(pos, '\0', limit - pos);
- if (end == NULL)
- break;
- size_t len = end + 1 - start;
- msg = insert_one_nul_terminated(st, start, len);
- if (msg != MAL_SUCCEED)
- return msg;
- start += len;
- pos = start;
+ while (current < limit) {
+ // Recognize the item and determine where it ends.
+ // If we reach 'limit' we'll goto end without updating
'current'.
+ const unsigned char *pos = current;
+ const unsigned char first = *pos++;
+ if ((first & 0xC0) != 0x80) {
+ // Not a nil, not a backref. Find out how long it is
+ pos = memchr(resume, '\0', limit - resume);
+ if (pos == NULL) {
+ // the end of the string is not yet in our
buffer
+ resume = limit;
+ goto end;
+ }
+ pos++; // include the NUL terminator
+ str msg = insert_non_nil(st, (char*)current);
+ if (msg != MAL_SUCCEED)
+ return msg;
+ } else if (first > 0x80) {
+ // 0x81 .. 0xBF, a short back ref
+ assert(first <= 0xBF);
+ BUN delta = first - 0x80;
+ reinsert(st, BATcount(st->bat) - delta);
+ } else {
+ // 0x80 so it's either a nil or a long backref
+ assert(first == 0x80);
+ if (pos == limit) {
+ // can't tell the difference
+ resume = current;
+ goto end;
+ }
+ unsigned char follower = *pos++;
+ if (follower == '\0') {
+ // it's a nil
+ str msg = insert_nil(st);
+ if (msg != MAL_SUCCEED)
+ return msg;
+ } else {
+ // it's a long backref
+ BUN delta = follower & 0x7F;
+ unsigned int shift = 0;
+ while (follower > 0x7F) {
+ if (pos == limit) {
+ // incomplete
+ resume = current;
+ goto end;
+ }
+ if (shift > 8 * sizeof(BUN) - 14) {
+ // the payload is 7 bits wide,
if we increase shift
+ // by 7 it's going to overflow.
+ // TODO maybe we need to set a
stricter limit?
+ throw(SQL,
"insert_nul_terminated_values", SQLSTATE(42000)"invalid backref in binary data
at %ld", (long)BATcount(st->bat));
+ }
+ shift += 7;
_______________________________________________
checkin-list mailing list -- [email protected]
To unsubscribe send an email to [email protected]