Changeset: 471b665d1b0a for MonetDB
URL: https://dev.monetdb.org/hg/MonetDB/rev/471b665d1b0a
Modified Files:
        gdk/gdk_strimps.c
        gdk/gdk_strimps.h
Branch: string_imprints
Log Message:

Finish refactoring


diffs (truncated from 857 to 300 lines):

diff --git a/gdk/gdk_strimps.c b/gdk/gdk_strimps.c
--- a/gdk/gdk_strimps.c
+++ b/gdk/gdk_strimps.c
@@ -65,55 +65,7 @@
 #include "gdk.h"
 #include "gdk_private.h"
 
-#if 0
 
-/* Given a BAT return the number of digrams in it. The observation is
- * that the number of digrams is the number of characters - 1:
- *
- * 1 digram starting at character 1
- * 1 digram starting at character 2
- * [...]
- * 1 digram starting at character n - 1
- */
-gdk_return
-STRMPndigrams(BAT *b, size_t *n)
-{
-       // lng t0;
-       BUN i;
-       BATiter bi;
-       char *s;
-       // GDKtracer_set_component_level("ALGO", "DEBUG");
-       // struct canditer ci;
-
-       // t0 = GDKusec();
-       // BATcheck(b, NULL);
-       assert(b->ttype == TYPE_str);
-
-       bi = bat_iterator(b);
-       *n = 0;
-       for (i = 0; i < b->batCount; i++) {
-               s = (char *)BUNtail(bi, i);
-                // *n += STRMP_strlen(s) - 1;
-               *n += strlen(s) - 1;
-               // TRC_DEBUG(ALGO, "s["LLFMT"]=%s\n", i, s);
-       }
-
-       // TRC_DEBUG(ALGO, LLFMT "usec\n", GDKusec() - t0);
-       // GDKtracer_flush_buffer();
-
-       return GDK_SUCCEED;
-}
-#endif
-
-/* The isIgnored is a bit suspect in terms of unicode. There are
- * non-ASCII codepoints that are considered spaces, for example the
- * codepoints in the range U+2000-U+200f.
- */
-#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
-#define isNotIgnored(x) (!isIgnored(x))
-#define pairToIndex(b1, b2) (DataPair)(((uint8_t)b2)<<8 | ((uint8_t)b1))
-#define indexToPair2(idx) (idx & 0xff00) >> 8
-#define indexToPair1(idx) (idx & 0xff)
 #define swp(_a, _i, _j, TPE)                   \
        do {                                    \
                TPE _t = ((TPE *)_a)[_i];       \
@@ -121,112 +73,125 @@ STRMPndigrams(BAT *b, size_t *n)
                ((TPE *) _a)[_j] = _t;                  \
        } while(0)
 
-/* This counts how many unicode codepoints the given string
- * contains.
+
+/* Macros for accessing metadada of a strimp. These are recorded in the
+ * first 8 bytes of the heap.
  */
-static size_t
-STRMP_utf8_strlen(const uint8_t *s)
-{
-       size_t ret = 0;
-       size_t i;
-       int m,n;
-       uint8_t c;
+#define NPAIRS(d) (((d) & (0xff << 8)) >> 8)
+#define HSIZE(d) (((d) & (0xffff << 16)) >> 16)
+
+#undef UTF8STRINGS             /* Not using utf8 for now */
+#ifdef UTF8STRINGS
+static bool
+pair_equal(CharPair *p1, CharPair *p2) {
+       if(p1->psize != p2->psize)
+               return false;
+
+       for(size_t i = 0; i < p1->psize; i++)
+               if (*(p1->pbytes + i) != *(p2->pbytes + i))
+                       return false;
+
+       return true;
+}
+#else
+/* BytePairs implementation */
+#define isIgnored(x) (isspace((x)) || isdigit((x)) || ispunct((x)))
+#define isNotIgnored(x) (!isIgnored(x))
+#define pairToIndex(b1, b2) (uint16_t)(((uint8_t)b2)<<8 | ((uint8_t)b1))
+#define indexToPair2(idx) (idx & 0xff00) >> 8
+#define indexToPair1(idx) (idx & 0xff)
+
+static bool
+pair_equal(CharPair *p1, CharPair *p2) {
+       return p1->pbytes[0] == p2->pbytes[0] &&
+               p1->pbytes[1] == p2->pbytes[1];
+
+}
 
-       i = 0;
-       while((c = *(s + i)) != 0) {
-               if (c < 0x80)
-                       i++;
-               else {
-                       for (n = 0, m=0x40; c & m; n++, m >>= 1)
-                               ;
-                       /* n is now the number of 10xxxxxx bytes that should
-                          follow. */
-                       if (n == 0 || n >= 4)
-                               /* TODO: handle invalid utf-8 */
-                               {}
-                       i += n+1;
-               }
-               ret++;
+static int64_t
+histogram_index(PairHistogramElem *hist, size_t hsize, CharPair *p) {
+       (void) hist;
+       (void) hsize;
+       return pairToIndex(p->pbytes[0], p->pbytes[1]);
+}
+
+static bool
+pair_at(PairIterator *pi, CharPair *p) {
+       if (pi->pos >= pi->lim)
+               return false;
+       p->pbytes = (uint8_t*)pi->s + pi->pos;
+       p->psize = 2;
+       return true;
+}
+
+static bool
+next_pair(PairIterator *pi) {
+       if (pi->pos >= pi->lim)
+               return false;
+       pi->pos++;
+       return true;
+}
+
+#endif // UTF8STRINGS
+
+static int8_t
+strimp_lookup(Strimps *s, CharPair *p) {
+       int8_t ret = -1;
+       size_t idx = 0;
+       size_t npairs = NPAIRS((uint64_t)s->strimps.base[0]);
+       size_t offset = 0;
+       CharPair sp;
+       (void)p;
+
+       for (idx = 0; idx < npairs; idx++) {
+               sp.psize = s->sizes_base[idx];
+               sp.pbytes = s->pairs_base + offset;
+               if (pair_equal(&sp, p))
+                       return idx;
+               offset += sp.psize;
        }
 
        return ret;
 }
 
-static size_t
-STRMP_utf8_next_char_index(const str s, size_t cidx) {
-       assert(cidx < strlen(s)); /* !!! */
-       if (*(s+cidx+1) == 0)
-               return 0;
-       return cidx + 1;
+static bool
+ignored(CharPair *p, uint8_t elm) {
+       assert(elm == 0 || elm == 1);
+       return isIgnored(p->pbytes[elm]);
 }
 
-#if 0
-static uint8_t*
-STRMP_utf8_char_at(const str s, size_t i, size_t *len) {
-       *len = 1;
-       return (uint8_t *)(s + i);
-}
-#endif
-
-/* Construct a histogram of pairs of bytes in the input BAT.
- *
- * Return the histogram in hist and the number of non-zero bins in
- * count.
- */
-static gdk_return
-STRMPmakehistogramBP(BAT *b, uint64_t *hist, size_t hist_size, size_t *nbins)
-{
-       lng t0=0;
-       size_t hi;
-       BUN i;
-       BATiter bi;
-       char *ptr, *s;
-       /* uint64_t cur_min = 0; */
-
-       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
-       assert(b->ttype == TYPE_str);
-
-       for(hi = 0; hi < hist_size; hi++)
-               hist[hi] = 0;
+#define MAX_PAIR_SIZE 8
 
-       bi = bat_iterator(b);
-       *nbins = 0;
-       for(i = 0; i < b->batCount; i++) {
-               s = (char *)BUNtvar(bi, i);
-               if (!strNil(s)) {
-                       for(ptr = s; *ptr != 0 && *(ptr + 1) != 0; ptr++) {
-                               if (isIgnored(*(ptr+1))) {
-                                       /* Skip this and the next pair
-                                        * if the next char is ignored.
-                                        */
-                                       ptr++;
-                               }
-                               else if (isIgnored(*ptr)) {
-                                       /* Skip this pair if the current
-                                        * char is ignored. This should
-                                        * only happen at the beginnig
-                                        * of a string.
-                                        */
-                                       ;
-                               }
-                               else {
-                                       hi = pairToIndex(*(ptr), *(ptr+1));
-                                       assert(hi < hist_size);
-                                       if (hist[hi] == 0)
-                                               (*nbins)++;
-                                       hist[hi]++;
-                                       /* if (hist[hi] > cur_min) */
-                                       /*      cur_min = add_to_header(hi, 
hist[hi]); */
-                               }
-                       }
-               }
+/* Given a strimp header and a string compute the bitstring of which
+ * digrams(byte pairs) are present in the string. The strimp header is a
+ * map from digram(byte pair) to index in the strimp.
+ *
+ * This should probably be inlined.
+ */
+static uint64_t
+STRMPmakebitstring(const str s, Strimps *r)
+{
+       uint64_t ret = 0;
+       int8_t pair_idx = 0;
+       PairIterator pi;
+       CharPair cp;
+
+       pi.s = s;
+       pi.pos = 0;
+       pi.lim = strlen(s);
+
+       while(pair_at(&pi, &cp)) {
+               pair_idx = strimp_lookup(r, &cp);
+               if (pair_idx > 0)
+                       ret |= 0x1 << pair_idx;
+               next_pair(&pi);
        }
 
-       TRC_DEBUG(ALGO, LLFMT " usec\n", GDKusec() - t0);
-       GDKtracer_flush_buffer();
-       return GDK_SUCCEED;
+       return ret;
 }
 
+
+
 /* Given a histogram find the indices of the STRIMP_HEADER_SIZE largest
  * counts.
  *
@@ -248,182 +213,162 @@ STRMPmakehistogramBP(BAT *b, uint64_t *h
  * In the current implementation each index is a DataPair value that is
  * constructed by pairToIndex from 2 consecutive bytes in the input.
  */
-static StrimpHeader *
-make_header(StrimpHeader *h, uint64_t* hist, size_t hist_size)
+static void
+STRMPchoosePairs(PairHistogramElem *hist, size_t hist_size, CharPair *cp)
 {
        lng t0 = 0;
        size_t i;
        uint64_t max_counts[STRIMP_HEADER_SIZE] = {0};
+       size_t indices[STRIMP_HEADER_SIZE] = {0};
        const size_t cmin_max = STRIMP_HEADER_SIZE - 1;
        size_t hidx;
 
-       TRC_DEBUG_IF(ALGO) t0 = GDKusec();
-
-       for(i = 0; i < STRIMP_HEADER_SIZE; i++)
-               h->bytepairs[i] = 0;
+       TRC_DEBUG_IF(ACCELERATOR) t0 = GDKusec();
+       TRC_DEBUG(ACCELERATOR, "Pair selection\n");
 
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to