Changeset: 146b0a7a1b66 for MonetDB URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=146b0a7a1b66 Added Files: monetdb5/extras/rdf/30_rdf.mal monetdb5/extras/rdf/rdfalgebra.c monetdb5/extras/rdf/rdfalgebra.mal Modified Files: .hgtags monetdb5/extras/rdf/Makefile.ag monetdb5/extras/rdf/rdf.h monetdb5/extras/rdf/rdf_shredder.mx monetdb5/modules/mal/tokenizer.c monetdb5/modules/mal/tokenizer.h monetdb5/modules/mal/tokenizer.mal sql/backends/monet5/sql.mx Branch: default Log Message:
Merged lodrdf branch into default lodrdf has landed on default diffs (truncated from 1245 to 300 lines): diff --git a/.hgtags b/.hgtags --- a/.hgtags +++ b/.hgtags @@ -502,3 +502,4 @@ f21bc494423c56e9cb68bae0b400e1f2a6d0c2bc f8e913e56b99223764b1c22f526087d6bcad2656 Jul2012_SP2_release 2d08000a18ddbe8f2cc3f12e09b6def8e53ae53a Oct2012_1 4233be3c7f49f704e5ed466d2c1836c9dcfbb9f7 Oct2012_release +d762ae2e83bd42a48aebf0ebb051a643f77f77d7 Lod2_D2_5_Deliverable_MonetDBRDF diff --git a/monetdb5/extras/rdf/30_rdf.mal b/monetdb5/extras/rdf/30_rdf.mal new file mode 100644 --- /dev/null +++ b/monetdb5/extras/rdf/30_rdf.mal @@ -0,0 +1,20 @@ +# The contents of this file are subject to the MonetDB Public License +# Version 1.1 (the "License"); you may not use this file except in +# compliance with the License. You may obtain a copy of the License at +# http://www.monetdb.org/Legal/MonetDBLicense +# +# Software distributed under the License is distributed on an "AS IS" +# basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the +# License for the specific language governing rights and limitations +# under the License. +# +# The Original Code is the MonetDB Database System. +# +# The Initial Developer of the Original Code is CWI. +# Portions created by CWI are Copyright (C) 1997-July 2008 CWI. +# Copyright August 2008-2012 MonetDB B.V. +# All Rights Reserved. + +# This loads the MonetDB/RDF module +include tokenizer; +include rdfalgebra; diff --git a/monetdb5/extras/rdf/Makefile.ag b/monetdb5/extras/rdf/Makefile.ag --- a/monetdb5/extras/rdf/Makefile.ag +++ b/monetdb5/extras/rdf/Makefile.ag @@ -16,6 +16,7 @@ # All Rights Reserved. INCLUDES = ../../modules/atoms ../../modules/mal ../../mal \ + ../../modules/kernel \ ../../../clients/mapilib \ ../../../common/options \ ../../../common/stream \ @@ -25,7 +26,26 @@ INCLUDES = ../../modules/atoms ../../mod MTSAFE lib_rdf = { - NOINST + MODULE DIR = libdir/monetdb5 - SOURCES = rdf_shredder.mx rdf.h + SOURCES = rdf.h rdf_shredder.mx rdfalgebra.c + + LIBS = ../../tools/libmonetdb5 \ + ../../../gdk/libbat \ + $(MALLOC_LIBS) $(raptor_LIBS) } + +headers_rdf_mal = { + HEADERS = mal + DIR = libdir/monetdb5 + SOURCES = rdfalgebra.mal +} + +headers_autoload = { + HEADERS = mal + DIR = libdir/monetdb5/autoload + SOURCES = 30_rdf.mal +} + +#EXTRA_DIST_DIR = Tests +EXTRA_DIST = 30_rdf.mal rdfalgebra.mal diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h --- a/monetdb5/extras/rdf/rdf.h +++ b/monetdb5/extras/rdf/rdf.h @@ -27,6 +27,8 @@ #ifndef _RDF_H_ #define _RDF_H_ +#include <gdk.h> + #ifdef WIN32 #ifndef LIBRDF #define rdf_export extern __declspec(dllimport) @@ -41,7 +43,13 @@ #define _RDF_DEBUG rdf_export str -RDFParser(int *retval, str *location, str *graphname, str *schemam); +RDFParser(BAT **graph, str *location, str *graphname, str *schemam); + +rdf_export str +RDFleftfetchjoin_sortedestimate(int *result, int *lid, int *rid, lng *estimate); +rdf_export str +RDFleftfetchjoin_sorted(int *result, int* lid, int *rid); + #define TRIPLE_STORE 1 #define MLA_STORE 2 diff --git a/monetdb5/extras/rdf/rdf_shredder.mx b/monetdb5/extras/rdf/rdf_shredder.mx --- a/monetdb5/extras/rdf/rdf_shredder.mx +++ b/monetdb5/extras/rdf/rdf_shredder.mx @@ -92,7 +92,7 @@ typedef struct parserData { int line; /* locator for errors */ int column; /* locator for errors */ /**GRAPH DATA */ - BAT *graph[N_GRAPH_BAT]; /* BATs for the result + BAT **graph; /* BATs for the result shredded RDF graph */ } parserData; @@ -146,7 +146,7 @@ static void @= rdf_BUNappend_unq_1 bun = BUNfnd(BATmirror(@1),(ptr)@2); if (bun == BUN_NONE) { - if (BATcount(@1) > 4 * @1->T->hash->mask) { + if (@1->T->hash && BATcount(@1) > 4 * @1->T->hash->mask) { HASHdestroy(@1); BAThash(BATmirror(@1), 2*BATcount(@1)); } @@ -255,7 +255,6 @@ create_BAT(int ht, int tt, int size) /* disable all properties */ b->tsorted = FALSE; - b->trevsorted = FALSE; b->tdense = FALSE; b->tkey = FALSE; b->hdense = TRUE; @@ -264,7 +263,7 @@ create_BAT(int ht, int tt, int size) } static parserData* -parserData_create (str location) +parserData_create (str location, BAT** graph) { int i; @@ -277,8 +276,9 @@ parserData_create (str location) pdata->error = 0; pdata->warning = 0; pdata->location = location; + pdata->graph = graph; - for (i = 0; i < N_GRAPH_BAT; i++) { + for (i = 0; i <= N_GRAPH_BAT; i++) { pdata->graph[i] = NULL; } @@ -306,7 +306,7 @@ parserData_create (str location) return NULL; } /* MAP_LEX must have the key property */ - BATseqbase(pdata->graph[MAP_LEX], 1 << 30); + BATseqbase(pdata->graph[MAP_LEX], RDF_MIN_LITERAL); pdata->graph[MAP_LEX]->tkey = BOUND2BTRUE; pdata->graph[MAP_LEX]->T->nokey[0] = 0; pdata->graph[MAP_LEX]->T->nokey[1] = 0; @@ -329,7 +329,7 @@ parserData_create (str location) * example, it holds that S_SPO == S_SOP. */ @= order -@:order2(@1,@2,@3,@4)@ +@:order3(@1,@2,@3,@4)@ @= order2 if (!CTrefine(&ctref, @1, @2)) /* refine @2 given @1= sorted */ @@ -347,6 +347,7 @@ BBPcold(graph[@3_@4]->batCacheid); /* free ctref */ BBPreclaim(ctref); @ + @= order3 if ( !(CTrefine(&map_oid, @1, @2) /* refine @3 given @1= sorted */ && CTrefine(&ctref, map_oid, @3)))/* refine @4 given @3 */ @@ -385,19 +386,19 @@ post_processing (parserData *pdata) /* order MAP_LEX */ BATorder(BATmirror(graph[MAP_LEX])); - map_oid = BATmark(graph[MAP_LEX], 1<<30); /* BATmark will create a copy */ + map_oid = BATmark(graph[MAP_LEX], RDF_MIN_LITERAL); /* BATmark will create a copy */ BATorder(map_oid); BATsetaccess(map_oid, BAT_READ); /* force BAtmark not to copy bat */ - map_oid = BATmirror(BATmark(BATmirror(map_oid), 1<<30)); + map_oid = BATmirror(BATmark(BATmirror(map_oid), RDF_MIN_LITERAL)); BATsetaccess(graph[MAP_LEX], BAT_READ); /* force BATmark not to copy bat */ - graph[MAP_LEX] = BATmirror(BATmark(BATmirror(graph[MAP_LEX]), 1<<30)); + graph[MAP_LEX] = BATmirror(BATmark(BATmirror(graph[MAP_LEX]), RDF_MIN_LITERAL)); /* convert old oids of O_sort to new ones */ bi = bat_iterator(graph[O_sort]); mi = bat_iterator(map_oid); BATloop(graph[O_sort], p, d) { bt = (oid *) BUNtloc(bi, p); - if (*bt >= (1 << 30)) { + if (*bt >= (RDF_MIN_LITERAL)) { BUNfndVOID(r, mi, bt); void_inplace(graph[O_sort], p, BUNtloc(mi, r), 1); } @@ -484,6 +485,7 @@ raptor_free_parser(rparser); raptor_free_uri(uri); raptor_finish(); @ + @= clean if (pdata != NULL) { for (iret = 0; iret < N_GRAPH_BAT; iret++) { @@ -494,9 +496,11 @@ if (pdata != NULL) { } @ @c +#define RDF_CHUNK_SIZE 100*1024*1024 + /* Main RDF parser function that drives raptor */ str -RDFParser (int *retval, str *location, str *graphname, str *schema) +RDFParser (BAT **graph, str *location, str *graphname, str *schema) { raptor_parser *rparser; parserData *pdata; @@ -504,8 +508,6 @@ RDFParser (int *retval, str *location, s bit isURI; str ret; int iret; - BAT **graph; - BAT *retbat; (void) graphname; /* init tokenizer */ @@ -519,7 +521,7 @@ RDFParser (int *retval, str *location, s #endif /* Init pdata */ - pdata = parserData_create(*location); + pdata = parserData_create(*location,graph); if (pdata == NULL) { #ifdef _TKNZR_H TKNZRclose(&iret); @@ -555,16 +557,38 @@ RDFParser (int *retval, str *location, s uri = raptor_new_uri((unsigned char *) pdata->location); iret = raptor_parse_uri(rparser, uri, NULL); } else { - uri = raptor_new_uri( - raptor_uri_filename_to_uri_string(pdata->location)); - iret = raptor_parse_file(rparser, uri, NULL); + + /* Too slow loading --> use old code + FILE *fp = fopen(pdata->location, "r"); + char *buf = (char*) GDKmalloc(RDF_CHUNK_SIZE); + if (buf == NULL) { + throw(RDF, "rdf.rdfShred", + "could not allocate a %dMB file buffer\n", (int) (RDF_CHUNK_SIZE>>20)); + } + uri = raptor_new_uri(raptor_uri_filename_to_uri_string(pdata->location)); + iret = raptor_start_parse(rparser, uri); + while(fp && iret == 0) { + ssize_t len = (ssize_t) fread(buf, 1, RDF_CHUNK_SIZE, fp); + iret = raptor_parse_chunk(rparser, (const unsigned char*) buf, (size_t) len, len < RDF_CHUNK_SIZE); + } + fclose(fp); + + */ + + /* does/may? not work on large files -- therefore the abpove chunked read + iret = raptor_parse_file_stream(rparser, fp, pdata->location, uri); + */ + + /* Old code */ + uri = raptor_new_uri( + raptor_uri_filename_to_uri_string(pdata->location)); + iret = raptor_parse_file(rparser, uri, NULL); } @:clean_raptor@ #ifdef _TKNZR_H TKNZRclose(&iret); #endif - graph = pdata->graph; assert (pdata->tcount == BATcount(graph[S_sort]) && pdata->tcount == BATcount(graph[P_sort]) && pdata->tcount == BATcount(graph[O_sort])); @@ -593,25 +617,6 @@ RDFParser (int *retval, str *location, s @:clean@ throw(RDF, "rdf.rdfShred", "could not post-proccess data"); } - - /* prepare return bat of bats */ -/* XXX: BAT columns of TYPE_bat are no longer allowed: this function - * needs to be rewritten to return multiple BATs instead of a single - * BAT-of-batS */ - retbat = BATnew(TYPE_void, TYPE_bat, N_GRAPH_BAT); - if (retbat == NULL) { - @:clean@ - throw(RDF, "rdf.rdfShred", - "could not allocate enough memory for return bat"); - } - BATseqbase(retbat, 0); - for (iret = 0; iret < N_GRAPH_BAT; iret++) { - retbat = BUNappend(retbat, &graph[iret]->batCacheid, TRUE); - BBPunfix(graph[iret]->batCacheid); _______________________________________________ checkin-list mailing list [email protected] http://mail.monetdb.org/mailman/listinfo/checkin-list
