Changeset: 08f8e8ed02e0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=08f8e8ed02e0
Modified Files:
monetdb5/extras/rdf/rdfschema.mal
monetdb5/modules/mal/tokenizer.c
Branch: rdf
Log Message:
Modify the tokenizer to use only void-headed BATs
diffs (truncated from 372 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.mal b/monetdb5/extras/rdf/rdfschema.mal
--- a/monetdb5/extras/rdf/rdfschema.mal
+++ b/monetdb5/extras/rdf/rdfschema.mal
@@ -17,7 +17,7 @@
module rdf;
-io.print("RDFschemaExplore mal loaded");
+#io.print("RDFschemaExplore mal loaded");
command rdfschemaexplore(tbname:str, clname:str ) :void
address RDFSchemaExplore
comment "Explore the schema information from input table e.g., SPO in RDF";
diff --git a/monetdb5/modules/mal/tokenizer.c b/monetdb5/modules/mal/tokenizer.c
--- a/monetdb5/modules/mal/tokenizer.c
+++ b/monetdb5/modules/mal/tokenizer.c
@@ -56,7 +56,11 @@
#define MAX_TKNZR_DEPTH 256
#define INDEX MAX_TKNZR_DEPTH
static int tokenDepth = 0;
-static BAT *tokenBAT[MAX_TKNZR_DEPTH + 1];
+//static BAT *tokenBAT[MAX_TKNZR_DEPTH + 1];
+struct {
+ BAT *idx, *val;
+} tokenBAT[MAX_TKNZR_DEPTH + 1];
+
static BAT *TRANS = NULL; /* the catalog of tokenizers */
static char name[128];
@@ -70,16 +74,18 @@ static char name[128];
#define GET_d(x) ((sht) ((x) & 255))
#define GET_h(x) ((x) >> 8)
-static int prvlocate(BAT* b, oid *prv, str part)
+static int prvlocate(BAT* b, BAT* bidx, oid *prv, str part)
{
BAT *m = BATmirror(b);
BATiter mi = bat_iterator(m);
+ BATiter biidx = bat_iterator(bidx);
+
BUN p;
if (m->H->hash == NULL)
BAThash(m, 2 * BATcount(m));
HASHloop_str(mi, m->H->hash, p, part)
{
- if (*((oid *) BUNtail(mi, p)) == *prv) {
+ if (*((oid *) BUNtail(biidx, p)) == *prv) {
*prv = (oid) p;
return TRUE;
}
@@ -107,7 +113,9 @@ TKNZRopen(int *ret, str *in)
}
for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
- tokenBAT[depth] = 0;
+ //tokenBAT[depth] = 0;
+ tokenBAT[depth].idx = 0;
+ tokenBAT[depth].val = 0;
}
tokenDepth = 0;
@@ -134,14 +142,17 @@ TKNZRopen(int *ret, str *in)
throw(MAL, "tokenizer.open", MAL_MALLOC_FAIL);
BATkey(b, FALSE);
BATseqbase(b, 0);
- tokenBAT[INDEX] = b;
+ //tokenBAT[INDEX] = b;
+ tokenBAT[INDEX].val = b;
if (BKCsetName(&r, (int *) &(b->batCacheid), (str *) &batname)
!= MAL_SUCCEED)
throw(MAL, "tokenizer.open", OPERATION_FAILED);
if (BKCsetPersistent(&r, (int *) &(b->batCacheid)) !=
MAL_SUCCEED)
throw(MAL, "tokenizer.open", OPERATION_FAILED);
BUNappend(TRANS, batname, FALSE);
} else { /* existing tokenizer */
- tokenBAT[INDEX] = BATdescriptor(idx);
+ //tokenBAT[INDEX] = BATdescriptor(idx);
+ tokenBAT[INDEX].val = BATdescriptor(idx);
+
BUNappend(TRANS, batname, FALSE);
for (depth = 0; depth < MAX_TKNZR_DEPTH; depth++) {
@@ -149,8 +160,17 @@ TKNZRopen(int *ret, str *in)
idx = BBPindex(batname);
if (idx == 0)
break;
- tokenBAT[depth] = BATdescriptor(idx);
+ tokenBAT[depth].val = BATdescriptor(idx);
BUNappend(TRANS, batname, FALSE);
+
+ //For idx BATs
+ snprintf(batname, 132, "%s_idx_%d", name, depth);
+ idx = BBPindex(batname);
+ if (idx == 0)
+ break;
+ tokenBAT[depth].idx = BATdescriptor(idx);
+ BUNappend(TRANS, batname, FALSE);
+
}
tokenDepth = depth;
}
@@ -171,10 +191,11 @@ TKNZRclose(int *r)
TMsubcommit(TRANS);
for (i = 0; i < tokenDepth; i++) {
- BBPunfix(tokenBAT[i]->batCacheid);
+ BBPunfix(tokenBAT[i].idx->batCacheid);
+ BBPunfix(tokenBAT[i].val->batCacheid);
}
- BBPunfix(tokenBAT[INDEX]->batCacheid);
-
+ //BBPunfix(tokenBAT[INDEX].idx->batCacheid);
+ BBPunfix(tokenBAT[INDEX].val->batCacheid);
tokenDepth = 0;
BBPreclaim(TRANS);
@@ -217,7 +238,8 @@ TKNZRappend(oid *pos, str *s)
str batname;
str parts[MAX_TKNZR_DEPTH];
int i, new, r, depth;
- BAT *b;
+ BAT *bVal;
+ BAT *bIdx;
BUN p;
BUN idx = 0;
oid prv = 0;
@@ -242,44 +264,76 @@ TKNZRappend(oid *pos, str *s)
throw(MAL, "tokenizer",
ILLEGAL_ARGUMENT "input string breaks to too
many parts");
}
- if (depth > tokenDepth || tokenBAT[0] == NULL) {
+ if (depth > tokenDepth || tokenBAT[0].val == NULL) {
new = tokenDepth;
for (i = tokenDepth; i < depth; i++) {
- /* make new bat */
- batname = (str) GDKmalloc(128 * sizeof(char));
+ /* make new bat for value */
+ batname = (str) GDKmalloc(132 * sizeof(char));
snprintf(batname, 128, "%s_%d", name, i);
- b = BATnew(TYPE_oid, TYPE_str, 1024);
- if (b == NULL) {
+ bVal = BATnew(TYPE_void, TYPE_str, 1024);
+ if (bVal == NULL) {
GDKfree(batname);
GDKfree(url);
throw(MAL, "tokenizer.append", MAL_MALLOC_FAIL);
}
- BATkey(b, FALSE);
- tokenBAT[i] = b;
+ BATkey(bVal, FALSE);
+ BATseqbase(bVal, 0);
+
+ tokenBAT[i].val = bVal;
- if (BKCsetName(&r, (int *) &(b->batCacheid), (str *)
&batname)
+ if (BKCsetName(&r, (int *) &(bVal->batCacheid), (str *)
&batname)
!= MAL_SUCCEED) {
GDKfree(batname);
GDKfree(url);
throw(MAL, "tokenizer.open", OPERATION_FAILED);
}
- if (BKCsetPersistent(&r, (int *) &(b->batCacheid))
+ if (BKCsetPersistent(&r, (int *) &(bVal->batCacheid))
!= MAL_SUCCEED) {
GDKfree(batname);
GDKfree(url);
throw(MAL, "tokenizer.open", OPERATION_FAILED);
}
BUNappend(TRANS, batname, FALSE);
+
+ /* make new bat for index */
+ snprintf(batname, 132, "%s_idx_%d", name, i);
+ bIdx = BATnew(TYPE_void, TYPE_oid, 1024);
+ if (bIdx == NULL) {
+ GDKfree(batname);
+ GDKfree(url);
+ throw(MAL, "tokenizer.append", MAL_MALLOC_FAIL);
+ }
+ BATkey(bIdx, FALSE);
+ BATseqbase(bIdx, 0);
+
+ tokenBAT[i].idx = bIdx;
+
+ if (BKCsetName(&r, (int *) &(bIdx->batCacheid), (str *)
&batname)
+ != MAL_SUCCEED) {
+ GDKfree(batname);
+ GDKfree(url);
+ throw(MAL, "tokenizer.open", OPERATION_FAILED);
+ }
+ if (BKCsetPersistent(&r, (int *) &(bIdx->batCacheid))
+ != MAL_SUCCEED) {
+ GDKfree(batname);
+ GDKfree(url);
+ throw(MAL, "tokenizer.open", OPERATION_FAILED);
+ }
+ BUNappend(TRANS, batname, FALSE);
+
+
GDKfree(batname);
}
tokenDepth = depth;
}
+
/* findcommn */
- p = BUNfnd(BATmirror(tokenBAT[0]), parts[0]);
+ p = BUNfnd(BATmirror(tokenBAT[0].val), parts[0]);
if (p != BUN_NONE) {
prv = (oid) p;
for (i = 1; i < new; i++) {
- if (!prvlocate(tokenBAT[i], &prv, parts[i]))
+ if (!prvlocate(tokenBAT[i].val, tokenBAT[i].idx, &prv,
parts[i]))
break;
}
} else {
@@ -288,9 +342,10 @@ TKNZRappend(oid *pos, str *s)
if (i == depth) {
comp = COMP(prv, depth);
- *pos = BUNfnd(BATmirror(tokenBAT[INDEX]), (ptr) & comp);
+ *pos = BUNfnd(BATmirror(tokenBAT[INDEX].val), (ptr) & comp);
if (*pos != BUN_NONE) {
/* the string is already there */
+ //printf("The string %s is already there",url);
GDKfree(url);
return MAL_SUCCEED;
}
@@ -298,33 +353,41 @@ TKNZRappend(oid *pos, str *s)
/* insremainder */
for (; i < depth; i++) {
- idx = BATcount(tokenBAT[i]);
+ idx = BATcount(tokenBAT[i].val);
if (idx > MAX_h) {
GDKfree(url);
throw(MAL, "tokenizer.append",
OPERATION_FAILED " no more free oid's");
}
- tokenBAT[i] = BUNins(tokenBAT[i], (ptr) & prv, parts[i], FALSE);
- if (tokenBAT[i] == NULL) {
+ tokenBAT[i].val = BUNappend(tokenBAT[i].val, parts[i], TRUE);
+ if (tokenBAT[i].val == NULL) {
GDKfree(url);
throw(MAL, "tokenizer.append",
OPERATION_FAILED " could not append");
}
- if (tokenBAT[i]->T->hash == NULL ||
- BATcount(tokenBAT[i]) > 4 * tokenBAT[i]->T->hash->mask)
{
- HASHdestroy(tokenBAT[i]);
- BAThash(BATmirror(tokenBAT[i]), 2 *
BATcount(tokenBAT[i]));
+ if (tokenBAT[i].val->T->hash == NULL ||
+ BATcount(tokenBAT[i].val) > 4 *
tokenBAT[i].val->T->hash->mask) {
+ HASHdestroy(tokenBAT[i].val);
+ BAThash(BATmirror(tokenBAT[i].val), 2 *
BATcount(tokenBAT[i].val));
}
+
+ tokenBAT[i].idx = BUNappend(tokenBAT[i].idx, (ptr) & prv, TRUE);
+ if (tokenBAT[i].idx == NULL) {
+ GDKfree(url);
+ throw(MAL, "tokenizer.append",
+ OPERATION_FAILED " could not append");
+ }
+
prv = (oid) idx;
}
- *pos = (oid) BATcount(tokenBAT[INDEX]);
+ *pos = (oid) BATcount(tokenBAT[INDEX].val);
comp = COMP(prv, depth);
- BUNappend(tokenBAT[INDEX], (ptr) & comp, TRUE);
- if (tokenBAT[INDEX]->T->hash == NULL ||
- BATcount(tokenBAT[INDEX]) > 4 * tokenBAT[INDEX]->T->hash->mask)
{
- HASHdestroy(tokenBAT[INDEX]);
- BAThash(BATmirror(tokenBAT[INDEX]), 2 *
BATcount(tokenBAT[INDEX]));
+ BUNappend(tokenBAT[INDEX].val, (ptr) & comp, TRUE);
+ if (tokenBAT[INDEX].val->T->hash == NULL ||
+ BATcount(tokenBAT[INDEX].val) > 4 *
tokenBAT[INDEX].val->T->hash->mask) {
+ HASHdestroy(tokenBAT[INDEX].val);
+ BAThash(BATmirror(tokenBAT[INDEX].val), 2 *
BATcount(tokenBAT[INDEX].val));
}
GDKfree(url);
@@ -427,18 +490,18 @@ TKNZRlocate(Client cntxt, MalBlkPtr mb,
} else if (depth > tokenDepth) {
pos = oid_nil;
} else {
- p = BUNfnd(BATmirror(tokenBAT[0]), parts[0]);
+ p = BUNfnd(BATmirror(tokenBAT[0].val), parts[0]);
if (p != BUN_NONE) {
prv = (oid) p;
for (i = 1; i < depth; i++) {
- if (!prvlocate(tokenBAT[i], (ptr) & prv,
parts[i]))
+ if (!prvlocate(tokenBAT[i].val,
tokenBAT[i].idx, (ptr) & prv, parts[i]))
break;
}
if (i < depth) {
pos = oid_nil;
} else {
comp = COMP(prv, i);
- pos = BUNfnd(BATmirror(tokenBAT[INDEX]), (ptr)
& comp);
+ pos = BUNfnd(BATmirror(tokenBAT[INDEX].val),
(ptr) & comp);
}
} else {
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list