Changeset: f46a719af133 for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/f46a719af133
Modified Files:
gdk/gdk_strimps.c
Branch: string_imprints
Log Message:
Construct strimp descriptor correctly
diffs (92 lines):
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -20,7 +20,7 @@
* byte (64 bit) words.
*
* The first 64 bit word describes how the header of the strimp is
- * encoded. The most significant byte (v in the schematic below) is the
+ * encoded. The least significant byte (v in the schematic below) is the
* version number. The second (np) is the number of pairs in the
* header. The third (b/p) is the number of bytes per pair if each pair
* is encoded using a constant number of bytes or 0 if it is utf-8. The
@@ -205,22 +205,23 @@ GDKstrimp_make_histogram(BAT *b, uint64_
return GDK_SUCCEED;
}
-/* Given a histogram find the indices of the 64 largest counts.
+/* Given a histogram find the indices of the STRIMP_HEADER_SIZE largest
+ * counts.
*
* We make one scan of histogram and every time we find a count that is
- * greater than the current minimum of the 64, we bubble it up in the
- * header until we find a count that is greater. We carry the index in
- * the histogram because this is the information we are actually
- * interested in keeping.
+ * greater than the current minimum of the STRIMP_HEADER_SIZE, we bubble
+ * it up in the header until we find a count that is greater. We carry
+ * the index in the histogram because this is the information we are
+ * actually interested in keeping.
*
- * At the end of this process we have the indices of 64 largest counts
- * in the histogram. This process is O(n) in time since we are doing
- * constant work (at most 63 comparisons and swaps) for each item in the
- * histogram and as such is (theoretically) more efficient than sorting
- * (O(nlog n))and taking the 64 largest elements. This depends on the
- * size of the histogram n. For some small n sorting might be more
- * efficient, but for such inputs the difference should not be
- * noticeable.
+ * At the end of this process we have the indices of STRIMP_HEADER_SIZE
+ * largest counts in the histogram. This process is O(n) in time since
+ * we are doing constant work (at most 63 comparisons and swaps) for
+ * each item in the histogram and as such is (theoretically) more
+ * efficient than sorting (O(nlog n))and taking the STRIMP_HEADER_SIZE
+ * largest elements. This depends on the size of the histogram n. For
+ * some small n sorting might be more efficient, but for such inputs the
+ * difference should not be noticeable.
*
* In the current implementation each index is a DataPair value that is
* constructed by pairToIndex from 2 consecutive bytes in the input.
@@ -328,6 +329,9 @@ create_strimp_heap(BAT *b, StrimpHeader
{
Heap *r = NULL;
uint64_t *d;
+ uint64_t descriptor;
+ uint8_t npairs, bytes_per_pair;
+ uint16_t hsize;
size_t i,j;
const char *nme;
@@ -342,7 +346,17 @@ create_strimp_heap(BAT *b, StrimpHeader
}
r->free = STRIMP_OFFSET * sizeof(uint64_t);
+ npairs = STRIMP_HEADER_SIZE;
+ bytes_per_pair = 2; /* Bytepair implementation */
+ hsize = sizeof(h->bytepairs);
+
+ assert(bytes_per_pair == 0 || npairs*bytes_per_pair == hsize);
+
+ descriptor = 0;
+ descriptor = STRIMP_VERSION | npairs << 8 | bytes_per_pair << 16 |
hsize << 24;
+
d = (uint64_t *)r->base;
+ *d++ = descriptor;
/* This loop assumes that we are working with byte pairs
* (i.e. the type of the header is uint16_t). TODO: generalize.
*/
@@ -352,7 +366,14 @@ create_strimp_heap(BAT *b, StrimpHeader
*d <<= 16;
*d |= h->bytepairs[i + j];
}
+ d++;
}
+#ifndef NDEBUG
+ FILE *fp = fopen("/tmp/foo.strimp", "wb");
+ fwrite(r->base, sizeof(uint64_t), STRIMP_HEADER_SIZE/4 + 1, fp);
+ fclose(fp);
+#endif
+
return r;
}
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list