Changeset: 09495d868d13 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=09495d868d13
Modified Files:
monetdb5/modules/mal/tokenizer.c
Branch: default
Log Message:
Ported tokenizer from rdf branch to default branch.
Unfortunately it is not possible to graft a changeset, so we just made
a copy of tokenizer.c from changeset 3e4ece2b7085 in the rdf branch.
Still to be done: we need upgrade code since the tokenizer uses
persistent bats that have now been split into two. These bats can be
recognized by their names.
diffs (truncated from 360 to 300 lines):
diff --git a/monetdb5/modules/mal/tokenizer.c b/monetdb5/modules/mal/tokenizer.c
--- a/monetdb5/modules/mal/tokenizer.c
+++ b/monetdb5/modules/mal/tokenizer.c
@@ -56,7 +56,11 @@
#define MAX_TKNZR_DEPTH 256
#define INDEX MAX_TKNZR_DEPTH
static int tokenDepth = 0;
-static BAT *tokenBAT[MAX_TKNZR_DEPTH + 1];
+//static BAT *tokenBAT[MAX_TKNZR_DEPTH + 1];
+struct {
+ BAT *idx, *val;
+} tokenBAT[MAX_TKNZR_DEPTH + 1];
+
static BAT *TRANS = NULL; /* the catalog of tokenizers */
static char name[128];
@@ -70,16 +74,18 @@ static char name[128];
#define GET_d(x) ((sht) ((x) & 255))
#define GET_h(x) ((x) >> 8)
-static int prvlocate(BAT* b, oid *prv, str part)
+static int prvlocate(BAT* b, BAT* bidx, oid *prv, str part)
{
BAT *m = BATmirror(b);
BATiter mi = bat_iterator(m);
+ BATiter biidx = bat_iterator(bidx);
+
BUN p;
if (m->H->hash == NULL)
BAThash(m, 2 * BATcount(m));
HASHloop_str(mi, m->H->hash, p, part)
{
- if (*((oid *) BUNtail(mi, p)) == *prv) {
+ if (*((oid *) BUNtail(biidx, p)) == *prv) {
*prv = (oid) p;
return TRUE;
}
@@ -107,7 +113,9 @@ TKNZRopen(int *ret, str *in)
}
for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
- tokenBAT[depth] = 0;
+ //tokenBAT[depth] = 0;
+ tokenBAT[depth].idx = 0;
+ tokenBAT[depth].val = 0;
}
tokenDepth = 0;
@@ -134,14 +142,17 @@ TKNZRopen(int *ret, str *in)
throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL);
BATkey(b, FALSE);
BATseqbase(b, 0);
- tokenBAT[INDEX] = b;
+ //tokenBAT[INDEX] = b;
+ tokenBAT[INDEX].val = b;
if (BKCsetName(&r, (int *) &(b->batCacheid), (str *) &batname)
!= MAL_SUCCEED)
throw(MAL, "tokenizer.open", OPERATION_FAILED);
if (BKCsetPersistent(&r, (int *) &(b->batCacheid)) !=
MAL_SUCCEED)
throw(MAL, "tokenizer.open", OPERATION_FAILED);
BUNappend(TRANS, batname, FALSE);
} else { /* existing tokenizer */
- tokenBAT[INDEX] = BATdescriptor(idx);
+ //tokenBAT[INDEX] = BATdescriptor(idx);
+ tokenBAT[INDEX].val = BATdescriptor(idx);
+
BUNappend(TRANS, batname, FALSE);
for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
@@ -149,8 +160,17 @@ TKNZRopen(int *ret, str *in)
idx = BBPindex(batname);
if (idx == 0)
break;
- tokenBAT[depth] = BATdescriptor(idx);
+ tokenBAT[depth].val = BATdescriptor(idx);
BUNappend(TRANS, batname, FALSE);
+
+ //For idx BATs
+ snprintf(batname, 132, "%s_idx_%d", name, depth);
+ idx = BBPindex(batname);
+ if (idx == 0)
+ break;
+ tokenBAT[depth].idx = BATdescriptor(idx);
+ BUNappend(TRANS, batname, FALSE);
+
}
tokenDepth = depth;
}
@@ -171,10 +191,11 @@ TKNZRclose(int *r)
TMsubcommit(TRANS);
for (i = 0; i < tokenDepth; i++) {
- BBPunfix(tokenBAT[i]->batCacheid);
+ BBPunfix(tokenBAT[i].idx->batCacheid);
+ BBPunfix(tokenBAT[i].val->batCacheid);
}
- BBPunfix(tokenBAT[INDEX]->batCacheid);
-
+ //BBPunfix(tokenBAT[INDEX].idx->batCacheid);
+ BBPunfix(tokenBAT[INDEX].val->batCacheid);
tokenDepth = 0;
BBPreclaim(TRANS);
@@ -217,7 +238,8 @@ TKNZRappend(oid *pos, str *s)
str batname;
str parts[MAX_TKNZR_DEPTH];
int i, new, r, depth;
- BAT *b;
+ BAT *bVal;
+ BAT *bIdx;
BUN p;
BUN idx = 0;
oid prv = 0;
@@ -242,44 +264,76 @@ TKNZRappend(oid *pos, str *s)
throw(MAL, "tokenizer",
ILLEGAL_ARGUMENT "input string breaks to too
many parts");
}
- if (depth > tokenDepth || tokenBAT[0] == NULL) {
+ if (depth > tokenDepth || tokenBAT[0].val == NULL) {
new = tokenDepth;
for (i = tokenDepth; i < depth; i++) {
- /* make new bat */
- batname = (str) GDKmalloc(128 * sizeof(char));
+ /* make new bat for value */
+ batname = (str) GDKmalloc(132 * sizeof(char));
snprintf(batname, 128, "%s_%d", name, i);
- b = BATnew(TYPE_oid, TYPE_str, 1024);
- if (b == NULL) {
+ bVal = BATnew(TYPE_void, TYPE_str, 1024);
+ if (bVal == NULL) {
GDKfree(batname);
GDKfree(url);
throw(MAL, "tokenizer.append", MAL_MALLOC_FAIL);
}
- BATkey(b, FALSE);
- tokenBAT[i] = b;
+ BATkey(bVal, FALSE);
+ BATseqbase(bVal, 0);
+
+ tokenBAT[i].val = bVal;
- if (BKCsetName(&r, (int *) &(b->batCacheid), (str *)
&batname)
+ if (BKCsetName(&r, (int *) &(bVal->batCacheid), (str *)
&batname)
!= MAL_SUCCEED) {
GDKfree(batname);
GDKfree(url);
throw(MAL, "tokenizer.open", OPERATION_FAILED);
}
- if (BKCsetPersistent(&r, (int *) &(b->batCacheid))
+ if (BKCsetPersistent(&r, (int *) &(bVal->batCacheid))
!= MAL_SUCCEED) {
GDKfree(batname);
GDKfree(url);
throw(MAL, "tokenizer.open", OPERATION_FAILED);
}
BUNappend(TRANS, batname, FALSE);
+
+ /* make new bat for index */
+ snprintf(batname, 132, "%s_idx_%d", name, i);
+ bIdx = BATnew(TYPE_void, TYPE_oid, 1024);
+ if (bIdx == NULL) {
+ GDKfree(batname);
+ GDKfree(url);
+ throw(MAL, "tokenizer.append", MAL_MALLOC_FAIL);
+ }
+ BATkey(bIdx, FALSE);
+ BATseqbase(bIdx, 0);
+
+ tokenBAT[i].idx = bIdx;
+
+ if (BKCsetName(&r, (int *) &(bIdx->batCacheid), (str *)
&batname)
+ != MAL_SUCCEED) {
+ GDKfree(batname);
+ GDKfree(url);
+ throw(MAL, "tokenizer.open", OPERATION_FAILED);
+ }
+ if (BKCsetPersistent(&r, (int *) &(bIdx->batCacheid))
+ != MAL_SUCCEED) {
+ GDKfree(batname);
+ GDKfree(url);
+ throw(MAL, "tokenizer.open", OPERATION_FAILED);
+ }
+ BUNappend(TRANS, batname, FALSE);
+
+
GDKfree(batname);
}
tokenDepth = depth;
}
+
/* findcommn */
- p = BUNfnd(BATmirror(tokenBAT[0]), parts[0]);
+ p = BUNfnd(BATmirror(tokenBAT[0].val), parts[0]);
if (p != BUN_NONE) {
prv = (oid) p;
for (i = 1; i < new; i++) {
- if (!prvlocate(tokenBAT[i], &prv, parts[i]))
+ if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, &prv,
parts[i]))
break;
}
} else {
@@ -288,9 +342,10 @@ TKNZRappend(oid *pos, str *s)
if (i == depth) {
comp = COMP(prv, depth);
- *pos = BUNfnd(BATmirror(tokenBAT[INDEX]), (ptr) & comp);
+ *pos = BUNfnd(BATmirror(tokenBAT[INDEX].val), (ptr) & comp);
if (*pos != BUN_NONE) {
/* the string is already there */
+ //printf("The string %s is already there",url);
GDKfree(url);
return MAL_SUCCEED;
}
@@ -298,33 +353,41 @@ TKNZRappend(oid *pos, str *s)
/* insremainder */
for (; i < depth; i++) {
- idx = BATcount(tokenBAT[i]);
+ idx = BATcount(tokenBAT[i].val);
if (idx > MAX_h) {
GDKfree(url);
throw(MAL, "tokenizer.append",
OPERATION_FAILED " no more free oid's");
}
- tokenBAT[i] = BUNins(tokenBAT[i], (ptr) & prv, parts[i], FALSE);
- if (tokenBAT[i] == NULL) {
+ tokenBAT[i].val = BUNappend(tokenBAT[i].val, parts[i], TRUE);
+ if (tokenBAT[i].val == NULL) {
GDKfree(url);
throw(MAL, "tokenizer.append",
OPERATION_FAILED " could not append");
}
- if (tokenBAT[i]->T->hash == NULL ||
- BATcount(tokenBAT[i]) > 4 * tokenBAT[i]->T->hash->mask)
{
- HASHdestroy(tokenBAT[i]);
- BAThash(BATmirror(tokenBAT[i]), 2 *
BATcount(tokenBAT[i]));
+ if (tokenBAT[i].val->T->hash == NULL ||
+ BATcount(tokenBAT[i].val) > 4 *
tokenBAT[i].val->T->hash->mask) {
+ HASHdestroy(tokenBAT[i].val);
+ BAThash(BATmirror(tokenBAT[i].val), 2 *
BATcount(tokenBAT[i].val));
}
+
+ tokenBAT[i].idx = BUNappend(tokenBAT[i].idx, (ptr) & prv, TRUE);
+ if (tokenBAT[i].idx == NULL) {
+ GDKfree(url);
+ throw(MAL, "tokenizer.append",
+ OPERATION_FAILED " could not append");
+ }
+
prv = (oid) idx;
}
- *pos = (oid) BATcount(tokenBAT[INDEX]);
+ *pos = (oid) BATcount(tokenBAT[INDEX].val);
comp = COMP(prv, depth);
- BUNappend(tokenBAT[INDEX], (ptr) & comp, TRUE);
- if (tokenBAT[INDEX]->T->hash == NULL ||
- BATcount(tokenBAT[INDEX]) > 4 * tokenBAT[INDEX]->T->hash->mask)
{
- HASHdestroy(tokenBAT[INDEX]);
- BAThash(BATmirror(tokenBAT[INDEX]), 2 *
BATcount(tokenBAT[INDEX]));
+ BUNappend(tokenBAT[INDEX].val, (ptr) & comp, TRUE);
+ if (tokenBAT[INDEX].val->T->hash == NULL ||
+ BATcount(tokenBAT[INDEX].val) > 4 *
tokenBAT[INDEX].val->T->hash->mask) {
+ HASHdestroy(tokenBAT[INDEX].val);
+ BAThash(BATmirror(tokenBAT[INDEX].val), 2 *
BATcount(tokenBAT[INDEX].val));
}
GDKfree(url);
@@ -427,18 +490,18 @@ TKNZRlocate(Client cntxt, MalBlkPtr mb,
} else if (depth > tokenDepth) {
pos = oid_nil;
} else {
- p = BUNfnd(BATmirror(tokenBAT[0]), parts[0]);
+ p = BUNfnd(BATmirror(tokenBAT[0].val), parts[0]);
if (p != BUN_NONE) {
prv = (oid) p;
for (i = 1; i < depth; i++) {
- if (!prvlocate(tokenBAT[i], (ptr) & prv,
parts[i]))
+ if (!prvlocate(tokenBAT[i].val,
tokenBAT[i].idx, (ptr) & prv, parts[i]))
break;
}
if (i < depth) {
pos = oid_nil;
} else {
comp = COMP(prv, i);
- pos = BUNfnd(BATmirror(tokenBAT[INDEX]), (ptr)
& comp);
+ pos = BUNfnd(BATmirror(tokenBAT[INDEX].val),
(ptr) & comp);
}
} else {
pos = oid_nil;
@@ -457,20 +520,22 @@ takeOid(oid id, str *val)
str parts[MAX_TKNZR_DEPTH];
size_t lngth = 0;
str s;
+ BATiter biidx; //Iterator for index bat
- if (id >= BATcount(tokenBAT[INDEX])) {
+ if (id >= BATcount(tokenBAT[INDEX].val)) {
throw(MAL, "tokenizer.takeOid", OPERATION_FAILED " illegal
oid");
}
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list