Changeset: 471b665d1b0a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/471b665d1b0a
Modified Files:
gdk/gdk_strimps.c
gdk/gdk_strimps.h
Branch: string_imprints
Log Message:
Finish refactoring
diffs (truncated from 857 to 300 lines):
diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -65,55 +65,7 @@
#include "gdk.h"
#include "gdk_private.h"
-#if 0
-/* Given a BAT return the number of digrams in it. The observation is
- * that the number of digrams is the number of characters - 1:
- *
- * 1 digram starting at character 1
- * 1 digram starting at character 2
- * [...]
- * 1 digram starting at character n - 1
- */
-gdk_return
-STRMPndigrams(BAT *b, size_t *n)
-{
- // lng t0;
- BUN i;
- BATiter bi;
- char *s;
- // GDKtracer_set_component_level("ALGO", "DEBUG");
- // struct canditer ci;
-
- // t0 = GDKusec();
- // BATcheck(b, NULL);
- assert(b->ttype == TYPE_str);
-
- bi = bat_iterator(b);
- *n = 0;
- for (i = 0; i < b->batCount; i++) {
- s = (char *)BUNtail(bi, i);
- // *n += STRMP_strlen(s) - 1;
- *n += strlen(s) - 1;
- // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s);
- }
-
- // TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
- // GDKtracer_flush_buffer();
-
- return GDK_SUCCEED;
-}
-#endif
-
-/* The isIgnored is a bit suspect in terms of unicode. There are
- * non-ASCII codepoints that are considered spaces, for example the
- * codepoints in the range U+2000-U+200f.
- */
-#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
-#define isNotIgnored(x) (!isIgnored(x))
-#define pairToIndex(b1, b2) (DataPair)(((uint8_t)b2)<<8 | ((uint8_t)b1))
-#define indexToPair2(idx) (idx & 0xff00) >> 8
-#define indexToPair1(idx) (idx & 0xff)
#define swp(_a, _i, _j, TPE) \
do { \
TPE _t = ((TPE *)_a)[_i]; \
@@ -121,112 +73,125 @@ STRMPndigrams(BAT *b, size_t *n)
((TPE *) _a)[_j] = _t; \
} while(0)
-/* This counts how many unicode codepoints the given string
- * contains.
+
+/* Macros for accessing metadada of a strimp. These are recorded in the
+ * first 8 bytes of the heap.
*/
-static size_t
-STRMP_utf8_strlen(const uint8_t *s)
-{
- size_t ret = 0;
- size_t i;
- int m,n;
- uint8_t c;
+#define NPAIRS(d) (((d) & (0xff << 8)) >> 8)
+#define HSIZE(d) (((d) & (0xffff << 16)) >> 16)
+
+#undef UTF8STRINGS /* Not using utf8 for now */
+#ifdef UTF8STRINGS
+static bool
+pair_equal(CharPair *p1, CharPair *p2) {
+ if(p1->psize != p2->psize)
+ return false;
+
+ for(size_t i = 0; i < p1->psize; i++)
+ if (*(p1->pbytes + i) != *(p2->pbytes + i))
+ return false;
+
+ return true;
+}
+#else
+/* BytePairs implementation */
+#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
+#define isNotIgnored(x) (!isIgnored(x))
+#define pairToIndex(b1, b2) (uint16_t)(((uint8_t)b2)<<8 | ((uint8_t)b1))
+#define indexToPair2(idx) (idx & 0xff00) >> 8
+#define indexToPair1(idx) (idx & 0xff)
+
+static bool
+pair_equal(CharPair *p1, CharPair *p2) {
+ return p1->pbytes[0] == p2->pbytes[0] &&
+ p1->pbytes[1] == p2->pbytes[1];
+
+}
- i = 0;
- while((c = *(s + i)) != 0) {
- if (c < 0x80)
- i++;
- else {
- for (n = 0, m=0x40; c & m; n++, m >>= 1)
- ;
- /* n is now the number of 10xxxxxx bytes that should
- follow. */
- if (n == 0 || n >= 4)
- /* TODO: handle invalid utf-8 */
- {}
- i += n+1;
- }
- ret++;
+static int64_t
+histogram_index(PairHistogramElem *hist, size_t hsize, CharPair *p) {
+ (void) hist;
+ (void) hsize;
+ return pairToIndex(p->pbytes[0], p->pbytes[1]);
+}
+
+static bool
+pair_at(PairIterator *pi, CharPair *p) {
+ if (pi->pos >= pi->lim)
+ return false;
+ p->pbytes = (uint8_t*)pi->s + pi->pos;
+ p->psize = 2;
+ return true;
+}
+
+static bool
+next_pair(PairIterator *pi) {
+ if (pi->pos >= pi->lim)
+ return false;
+ pi->pos++;
+ return true;
+}
+
+#endif // UTF8STRINGS
+
+static int8_t
+strimp_lookup(Strimps *s, CharPair *p) {
+ int8_t ret = -1;
+ size_t idx = 0;
+ size_t npairs = NPAIRS((uint64_t)s->strimps.base[0]);
+ size_t offset = 0;
+ CharPair sp;
+ (void)p;
+
+ for (idx = 0; idx < npairs; idx++) {
+ sp.psize = s->sizes_base[idx];
+ sp.pbytes = s->pairs_base + offset;
+ if (pair_equal(&sp, p))
+ return idx;
+ offset += sp.psize;
}
return ret;
}
-static size_t
-STRMP_utf8_next_char_index(const str s, size_t cidx) {
- assert(cidx < strlen(s)); /* !!! */
- if (*(s+cidx+1) == 0)
- return 0;
- return cidx + 1;
+static bool
+ignored(CharPair *p, uint8_t elm) {
+ assert(elm == 0 || elm == 1);
+ return isIgnored(p->pbytes[elm]);
}
-#if 0
-static uint8_t*
-STRMP_utf8_char_at(const str s, size_t i, size_t *len) {
- *len = 1;
- return (uint8_t *)(s + i);
-}
-#endif
-
-/* Construct a histogram of pairs of bytes in the input BAT.
- *
- * Return the histogram in hist and the number of non-zero bins in
- * count.
- */
-static gdk_return
-STRMPmakehistogramBP(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins)
-{
- lng t0=0;
- size_t hi;
- BUN i;
- BATiter bi;
- char *ptr, *s;
- /* uint64_t cur_min = 0; */
-
- TRC_DEBUG_IF(ALGO) t0 = GDKusec();
- assert(b->ttype == TYPE_str);
-
- for(hi = 0; hi < hist_size; hi++)
- hist[hi] = 0;
+#define MAX_PAIR_SIZE 8
- bi = bat_iterator(b);
- *nbins = 0;
- for(i = 0; i < b->batCount; i++) {
- s = (char *)BUNtvar(bi, i);
- if (!strNil(s)) {
- for(ptr = s; *ptr != 0 && *(ptr + 1) != 0; ptr++) {
- if (isIgnored(*(ptr+1))) {
- /* Skip this and the next pair
- * if the next char is ignored.
- */
- ptr++;
- }
- else if (isIgnored(*ptr)) {
- /* Skip this pair if the current
- * char is ignored. This should
- * only happen at the beginnig
- * of a string.
- */
- ;
- }
- else {
- hi = pairToIndex(*(ptr), *(ptr+1));
- assert(hi < hist_size);
- if (hist[hi] == 0)
- (*nbins)++;
- hist[hi]++;
- /* if (hist[hi] > cur_min) */
- /* cur_min = add_to_header(hi,
hist[hi]); */
- }
- }
- }
+/* Given a strimp header and a string compute the bitstring of which
+ * digrams(byte pairs) are present in the string. The strimp header is a
+ * map from digram(byte pair) to index in the strimp.
+ *
+ * This should probably be inlined.
+ */
+static uint64_t
+STRMPmakebitstring(const str s, Strimps *r)
+{
+ uint64_t ret = 0;
+ int8_t pair_idx = 0;
+ PairIterator pi;
+ CharPair cp;
+
+ pi.s = s;
+ pi.pos = 0;
+ pi.lim = strlen(s);
+
+ while(pair_at(&pi, &cp)) {
+ pair_idx = strimp_lookup(r, &cp);
+ if (pair_idx > 0)
+ ret |= 0x1 << pair_idx;
+ next_pair(&pi);
}
- TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0);
- GDKtracer_flush_buffer();
- return GDK_SUCCEED;
+ return ret;
}
+
+
/* Given a histogram find the indices of the STRIMP_HEADER_SIZE largest
* counts.
*
@@ -248,182 +213,162 @@ STRMPmakehistogramBP(BAT *b, uint64_t *h
* In the current implementation each index is a DataPair value that is
* constructed by pairToIndex from 2 consecutive bytes in the input.
*/
-static StrimpHeader *
-make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size)
+static void
+STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp)
{
lng t0 = 0;
size_t i;
uint64_t max_counts[STRIMP_HEADER_SIZE] = {0};
+ size_t indices[STRIMP_HEADER_SIZE] = {0};
const size_t cmin_max = STRIMP_HEADER_SIZE - 1;
size_t hidx;
- TRC_DEBUG_IF(ALGO) t0 = GDKusec();
-
- for(i = 0; i < STRIMP_HEADER_SIZE; i++)
- h->bytepairs[i] = 0;
+ TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
+ TRC_DEBUG(ACCELERATOR, "Pair selection\n");
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list