Changeset: 88c8fdf4e227 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=88c8fdf4e227
Modified Files:
monetdb5/extras/rdf/rdf_shredder.mx
sql/backends/monet5/sql.mx
Branch: lodrdf
Log Message:
Fix a bug in the SQLrdfShred() function (in sql/backends/monet5/sql.mx) that
occurred while loading RDF data.
Add the corresponding modification to rdf_shredder.mx.
diffs (truncated from 1087 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdf_shredder.mx b/monetdb5/extras/rdf/rdf_shredder.mx
--- a/monetdb5/extras/rdf/rdf_shredder.mx
+++ b/monetdb5/extras/rdf/rdf_shredder.mx
@@ -92,15 +92,15 @@ typedef struct parserData {
int line; /* locator for errors */
int column; /* locator for errors */
/**GRAPH DATA */
- BAT *graph[N_GRAPH_BAT]; /* BATs for the result
+ BAT **graph; /* BATs for the result
shredded RDF graph */
} parserData;
-/*
- * @-
- * The (fatal) errors and warnings produced by the raptor parser are handled
- * by the next three message handler functions.
- */
+@{
+@-
+The (fatal) errors and warnings produced by the raptor parser are handled
+by the next three message handler functions.
+
@= raptor_exception
@1->exception++;
@1->exceptionMsg = @2;
@@ -124,17 +124,17 @@ static void
}
}
-@
+@}
+
@c
@:rdf_parser_handler(fatal)@
@:rdf_parser_handler(error)@
@:rdf_parser_handler(warning)@
-/*
- * @-
- * The raptor parser needs to register a callback function that handles one
triple
- * at a time. Function rdf_parser_triple_handler() does exactly this.
- */
+@-
+The raptor parser needs to register a callback function that handles one triple
+at a time. Function rdf_parser_triple_handler() does exactly this.
+
@= rdf_insert
#ifdef _TKNZR_H
@:rdf_tknzr_insert(@2)@
@@ -146,7 +146,7 @@ static void
@= rdf_BUNappend_unq_1
bun = BUNfnd(BATmirror(@1),(ptr)@2);
if (bun == BUN_NONE) {
- if (BATcount(@1) > 4 * @1->T->hash->mask) {
+ if (@1->T->hash && BATcount(@1) > 4 * @1->T->hash->mask) {
HASHdestroy(@1);
BAThash(BATmirror(@1), 2*BATcount(@1));
}
@@ -185,7 +185,6 @@ if (@1 == NULL) {
@:raptor_exception(pdata, "could not append in@1")@
}
-@
@c
static void
tripleHandler(void* user_data, const raptor_statement* triple)
@@ -229,10 +228,9 @@ tripleHandler(void* user_data, const rap
return;
}
-/*
- * @-
- * Function RDFParser() is the entry point to parse an RDF document.
- */
+@-
+Function RDFParser() is the entry point to parse an RDF document.
+
@= set_handlers
/* set callback handler for triples */
raptor_set_statement_handler (@1, @2, tripleHandler);
@@ -241,7 +239,6 @@ raptor_set_fatal_error_handler (@1, @2,
raptor_set_error_handler (@1, @2, errorHandler);
raptor_set_warning_handler (@1, @2, warningHandler);
-@
@c
/* creates a BAT for the triple table */
static BAT*
@@ -255,7 +252,6 @@ create_BAT(int ht, int tt, int size)
/* disable all properties */
b->tsorted = FALSE;
- b->trevsorted = FALSE;
b->tdense = FALSE;
b->tkey = FALSE;
b->hdense = TRUE;
@@ -264,7 +260,7 @@ create_BAT(int ht, int tt, int size)
}
static parserData*
-parserData_create (str location)
+parserData_create (str location, BAT** graph)
{
int i;
@@ -277,8 +273,9 @@ parserData_create (str location)
pdata->error = 0;
pdata->warning = 0;
pdata->location = location;
+ pdata->graph = graph;
- for (i = 0; i < N_GRAPH_BAT; i++) {
+ for (i = 0; i <= N_GRAPH_BAT; i++) {
pdata->graph[i] = NULL;
}
@@ -306,7 +303,7 @@ parserData_create (str location)
return NULL;
}
/* MAP_LEX must have the key property */
- BATseqbase(pdata->graph[MAP_LEX], 1 << 30);
+ BATseqbase(pdata->graph[MAP_LEX], RDF_MIN_LITERAL);
pdata->graph[MAP_LEX]->tkey = BOUND2BTRUE;
pdata->graph[MAP_LEX]->T->nokey[0] = 0;
pdata->graph[MAP_LEX]->T->nokey[1] = 0;
@@ -314,22 +311,21 @@ parserData_create (str location)
return pdata;
}
-/*
- * @-
- * After the RDF document has been shredded into 3 bats and a lexical value
- * dictionary, a post-shred processing step follows that orders the lexical
- * dictionary, re-maps oids to match the ordered dictionary and finaly creates
- * all 6 permutations of the (subject, predicate, object) order.
- *
- * However, it is still to be examined if it worth the time to refine the order
- * of the last column. In most cases, during query time, the last column will
need
- * to be re-order for a subsequent sort-merge join. We introduce sort3 and
sort2
- * so we can investigate both possibilities. In addition, the first column
need to
- * be stored only once for each couple of orders with the same first column.
For
- * example, it holds that S_SPO == S_SOP.
- */
+@-
+After the RDF document has been shredded into 3 bats and a lexical value
+dictionary, a post-shred processing step follows that orders the lexical
+dictionary, re-maps oids to match the ordered dictionary and finally creates
+all 6 permutations of the (subject, predicate, object) order.
+
+However, it is still to be examined if it is worth the time to refine the order
+of the last column. In most cases, during query time, the last column will need
+to be re-ordered for a subsequent sort-merge join. We introduce sort3 and sort2
+so we can investigate both possibilities. In addition, the first column needs to
+be stored only once for each couple of orders with the same first column. For
+example, it holds that S_SPO == S_SOP.
+
@= order
-@:order2(@1,@2,@3,@4)@
+@:order3(@1,@2,@3,@4)@
@= order2
if (!CTrefine(&ctref, @1, @2)) /* refine @2 given @1= sorted */
@@ -347,6 +343,7 @@ BBPcold(graph[@3_@4]->batCacheid);
/* free ctref */
BBPreclaim(ctref);
@
+
@= order3
if ( !(CTrefine(&map_oid, @1, @2) /* refine @3 given @1= sorted */
&& CTrefine(&ctref, map_oid, @3)))/* refine @4 given @3
*/
@@ -366,6 +363,7 @@ BBPcold(graph[@3_@4]->batCacheid);
/* free map_oid */
BBPreclaim(map_oid);
@
+
@c
int CTrefine(BAT **ret, BAT *b, BAT *a); /* from modules/kernel/group.mx */
@@ -385,19 +383,19 @@ post_processing (parserData *pdata)
/* order MAP_LEX */
BATorder(BATmirror(graph[MAP_LEX]));
- map_oid = BATmark(graph[MAP_LEX], 1<<30); /* BATmark will create a
copy */
+ map_oid = BATmark(graph[MAP_LEX], RDF_MIN_LITERAL); /* BATmark will
create a copy */
BATorder(map_oid);
BATsetaccess(map_oid, BAT_READ); /* force BAtmark not to copy
bat */
- map_oid = BATmirror(BATmark(BATmirror(map_oid), 1<<30));
+ map_oid = BATmirror(BATmark(BATmirror(map_oid), RDF_MIN_LITERAL));
BATsetaccess(graph[MAP_LEX], BAT_READ); /* force BATmark not to copy
bat */
- graph[MAP_LEX] = BATmirror(BATmark(BATmirror(graph[MAP_LEX]), 1<<30));
+ graph[MAP_LEX] = BATmirror(BATmark(BATmirror(graph[MAP_LEX]),
RDF_MIN_LITERAL));
/* convert old oids of O_sort to new ones */
bi = bat_iterator(graph[O_sort]);
mi = bat_iterator(map_oid);
BATloop(graph[O_sort], p, d) {
bt = (oid *) BUNtloc(bi, p);
- if (*bt >= (1 << 30)) {
+ if (*bt >= (RDF_MIN_LITERAL)) {
BUNfndVOID(r, mi, bt);
void_inplace(graph[O_sort], p, BUNtloc(mi, r), 1);
}
@@ -484,6 +482,7 @@ raptor_free_parser(rparser);
raptor_free_uri(uri);
raptor_finish();
@
+
@= clean
if (pdata != NULL) {
for (iret = 0; iret < N_GRAPH_BAT; iret++) {
@@ -493,10 +492,13 @@ if (pdata != NULL) {
GDKfree(pdata);
}
@
+
@c
+#define RDF_CHUNK_SIZE 100*1024*1024
+
/* Main RDF parser function that drives raptor */
str
-RDFParser (int *retval, str *location, str *graphname, str *schema)
+RDFParser (BAT **graph, str *location, str *graphname, str *schema)
{
raptor_parser *rparser;
parserData *pdata;
@@ -504,8 +506,6 @@ RDFParser (int *retval, str *location, s
bit isURI;
str ret;
int iret;
- BAT **graph;
- BAT *retbat;
(void) graphname;
/* init tokenizer */
@@ -519,7 +519,7 @@ RDFParser (int *retval, str *location, s
#endif
/* Init pdata */
- pdata = parserData_create(*location);
+ pdata = parserData_create(*location,graph);
if (pdata == NULL) {
#ifdef _TKNZR_H
TKNZRclose(&iret);
@@ -555,16 +555,38 @@ RDFParser (int *retval, str *location, s
uri = raptor_new_uri((unsigned char *) pdata->location);
iret = raptor_parse_uri(rparser, uri, NULL);
} else {
- uri = raptor_new_uri(
-
raptor_uri_filename_to_uri_string(pdata->location));
- iret = raptor_parse_file(rparser, uri, NULL);
+
+ /* Too slow loading --> use old code
+ FILE *fp = fopen(pdata->location, "r");
+ char *buf = (char*) GDKmalloc(RDF_CHUNK_SIZE);
+ if (buf == NULL) {
+ throw(RDF, "rdf.rdfShred",
+ "could not allocate a %dMB file buffer\n",
(int) (RDF_CHUNK_SIZE>>20));
+ }
+ uri =
raptor_new_uri(raptor_uri_filename_to_uri_string(pdata->location));
+ iret = raptor_start_parse(rparser, uri);
+ while(fp && iret == 0) {
+ ssize_t len = (ssize_t) fread(buf, 1, RDF_CHUNK_SIZE,
fp);
+ iret = raptor_parse_chunk(rparser, (const unsigned
char*) buf, (size_t) len, len < RDF_CHUNK_SIZE);
+ }
+ fclose(fp);
+
+ */
+
+ /* does/may? not work on large files -- therefore the above chunked read
+ iret = raptor_parse_file_stream(rparser, fp,
pdata->location, uri);
+ */
+
+ /* Old code */
+ uri = raptor_new_uri(
+
raptor_uri_filename_to_uri_string(pdata->location));
+ iret = raptor_parse_file(rparser, uri, NULL);
}
@:clean_raptor@
#ifdef _TKNZR_H
TKNZRclose(&iret);
#endif
- graph = pdata->graph;
assert (pdata->tcount == BATcount(graph[S_sort]) &&
pdata->tcount == BATcount(graph[P_sort]) &&
pdata->tcount == BATcount(graph[O_sort]));
@@ -593,25 +615,6 @@ RDFParser (int *retval, str *location, s
@:clean@
throw(RDF, "rdf.rdfShred", "could not post-proccess data");
}
-
- /* prepare return bat of bats */
-/* XXX: BAT columns of TYPE_bat are no longer allowed: this function
- * needs to be rewritten to return multiple BATs instead of a single
- * BAT-of-batS */
- retbat = BATnew(TYPE_void, TYPE_bat, N_GRAPH_BAT);
- if (retbat == NULL) {
- @:clean@
- throw(RDF, "rdf.rdfShred",
- "could not allocate enough memory for return
bat");
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list