Changeset: 0c05e6360d8b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0c05e6360d8b
Modified Files:
monetdb5/extras/rdf/rdf.h
monetdb5/extras/rdf/rdf_shredder.c
monetdb5/extras/rdf/rdfparser.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Analyze the FK cardinality.
diffs (truncated from 384 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -73,6 +73,7 @@ RDFpartialjoin (bat *res, bat *lmap, bat
#define batsz 10000000
#define smallbatsz 100000
+#define smallHashBatsz 10000
#if STORE == TRIPLE_STORE
typedef enum {
diff --git a/monetdb5/extras/rdf/rdf_shredder.c
b/monetdb5/extras/rdf/rdf_shredder.c
--- a/monetdb5/extras/rdf/rdf_shredder.c
+++ b/monetdb5/extras/rdf/rdf_shredder.c
@@ -247,6 +247,9 @@ getObjectType(unsigned char* objStr, BUN
static void
tripleHandler(void* user_data, const raptor_statement* triple)
{
+#if CHECK_NUM_DBPONTOLOGY
+ const char* pos = NULL;
+#endif
parserData *pdata = ((parserData *) user_data);
BUN bun = BUN_NONE;
BUN realNumValue = BUN_NONE;
@@ -284,6 +287,11 @@ tripleHandler(void* user_data, const rap
if (triple->predicate->type == RAPTOR_TERM_TYPE_URI) {
unsigned char* predicateStr;
predicateStr = raptor_term_to_string(triple->predicate);
+ #if CHECK_NUM_DBPONTOLOGY
+ if ( (pos = strstr((str)predicateStr ,
"http://dbpedia.org/ontology")) != NULL){
+ pdata->numOntologyTriples++;
+ }
+ #endif
//rdf_insert(pdata, graph[MAP_LEX], (str) predicateStr,
&bun);
rdf_tknzr_insert((str) predicateStr, &bun);
rdf_BUNappend(pdata, graph[P_sort], &bun);
@@ -377,6 +385,9 @@ parserData_create (str location, BAT** g
pdata->warning = 0;
pdata->location = location;
pdata->graph = graph;
+#if CHECK_NUM_DBPONTOLOGY
+ pdata->numOntologyTriples = 0;
+#endif
for (i = 0; i <= N_GRAPH_BAT; i++) {
pdata->graph[i] = NULL;
@@ -719,7 +730,11 @@ RDFParser (BAT **graph, str *location, s
uri =
raptor_new_uri(world,raptor_uri_filename_to_uri_string(tmpfilename));
iret =
raptor_parser_parse_file(rparser, uri, NULL);
raptor_free_uri(uri);
- printf(".. Done \n");
+ #if
CHECK_NUM_DBPONTOLOGY
+ printf(".. Done (No
errors: %d | Loaded: " BUNFMT " | Ontology-based: %d) \n",pdata->error,
pdata->tcount, pdata->numOntologyTriples);
+ #else
+ printf(".. Done (No
errors: %d | Loaded: " BUNFMT ") \n",pdata->error, pdata->tcount);
+ #endif
}
}
closedir (dp);
@@ -779,6 +794,11 @@ RDFParser (BAT **graph, str *location, s
}
#else
+ #if CHECK_NUM_DBPONTOLOGY
+ printf("Total number of triples loaded: " BUNFMT " (Number of available
ontology-based triples: %d) \n", pdata->tcount, pdata->numOntologyTriples);
+ #else
+ printf("Total number of triples loaded: " BUNFMT "\n", pdata->tcount);
+ #endif
printf("Total number of error %d , fatal %d , warning %d",
pdata->error, pdata->fatal, pdata->warning);
#endif
/* post processing step */
diff --git a/monetdb5/extras/rdf/rdfparser.h b/monetdb5/extras/rdf/rdfparser.h
--- a/monetdb5/extras/rdf/rdfparser.h
+++ b/monetdb5/extras/rdf/rdfparser.h
@@ -29,7 +29,7 @@
#include <raptor2.h>
-
+#define CHECK_NUM_DBPONTOLOGY 0 /* Check how many rdf triples use
dbpontology */
typedef struct parserData {
/**PROPERTIES */
@@ -51,6 +51,10 @@ typedef struct parserData {
/**GRAPH DATA */
BAT **graph; /* BATs for the result
shredded RDF graph */
+#if CHECK_NUM_DBPONTOLOGY
+ int numOntologyTriples;
+#endif
+
} parserData;
void fatalHandler (void *user_data, raptor_log_message* message);
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -621,6 +621,9 @@ void initCSPropTypes(CSPropTypes* csProp
csPropTypes[id].lstPropTypes[j].defColIdx = -1;
csPropTypes[id].lstPropTypes[j].isFKProp = 0;
csPropTypes[id].lstPropTypes[j].refTblId = -1;
+ csPropTypes[id].lstPropTypes[j].refTblSupport =
0;
+ csPropTypes[id].lstPropTypes[j].numReferring =
0;
+ csPropTypes[id].lstPropTypes[j].numDisRefValues
= 0;
csPropTypes[id].lstPropTypes[j].isDirtyFKProp =
0;
csPropTypes[id].lstPropTypes[j].lstTypes =
(char*)GDKmalloc(sizeof(char) * csPropTypes[id].lstPropTypes[j].numType);
csPropTypes[id].lstPropTypes[j].lstFreq =
(int*)GDKmalloc(sizeof(int) * csPropTypes[id].lstPropTypes[j].numType);
@@ -5298,6 +5301,9 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
//Add rel info to csPropTypes
csPropTypes[from].lstPropTypes[propIdx].isFKProp = 1;
csPropTypes[from].lstPropTypes[propIdx].refTblId = to;
+ csPropTypes[from].lstPropTypes[propIdx].refTblSupport =
freqCSset->items[toFreqId].support;
+ csPropTypes[from].lstPropTypes[propIdx].numReferring =
rel.lstCnt[j];
+
}
}
@@ -5476,6 +5482,59 @@ str printFKs(CSrel *csRelFinalFKs, int f
return MAL_SUCCEED;
}
+
+/*
+ * Dump per-foreign-key multiplicity statistics to a text file and print
+ * summary counts on stdout.
+ *
+ * The output file is named "FKMultiplicity<freqThreshold>.txt" (relative
+ * path). One line is written for every property that is flagged isFKProp
+ * and has a non-zero numDisRefValues.
+ *
+ * csPropTypes   array of per-table property descriptions, indexed
+ *               0 .. numMergedCS-1
+ * numMergedCS   number of entries in csPropTypes
+ * freqThreshold only used to build the output file name
+ *
+ * If the output file cannot be opened, a message is printed and the
+ * function returns without collecting any statistics.
+ */
+static
+void printFKMultiplicityFromCSPropTypes(CSPropTypes* csPropTypes, int
numMergedCS, int freqThreshold){
+ char filename[100];
+ FILE *fout;
+ int i, j;
+
+ int numMtoM = 0; //Many To Many
+ int numOtoM = 0; //One to Many
+ int numOtoO = 0; //One to One
+
+ printf("Collect the statistic for FK multiplicity ... ");
+ /* Build the name in one bounded call instead of strcpy/sprintf/strcat */
+ snprintf(filename, sizeof(filename), "FKMultiplicity%d.txt", freqThreshold);
+
+ fout = fopen(filename,"wt");
+ if (fout == NULL) {
+     /* fopen returns NULL on failure; writing to it would crash */
+     printf("Failed! (cannot open %s for writing)\n", filename);
+     return;
+ }
+
+ /* Print cspropTypes */
+
+ fprintf(fout, "FromTbl PropId Prop ToTbl isMultiProp
PropCoverage RefferedTblSupport NumReffering NumReferred
Ratio\n");
+ for (i = 0; i < numMergedCS; i++){
+ for(j = 0; j < csPropTypes[i].numProp; j++){
+ if (csPropTypes[i].lstPropTypes[j].isFKProp){
+ /* Skipping zero here also protects the ratio division below */
+ if
(csPropTypes[i].lstPropTypes[j].numDisRefValues == 0) continue; // These
columns may be put into PSO, thus no FK
+
+ fprintf(fout, "%d %d "BUNFMT"
%d %d %d %d %d %d %f\n",
+ i , j,
csPropTypes[i].lstPropTypes[j].prop, csPropTypes[i].lstPropTypes[j].refTblId,
+
csPropTypes[i].lstPropTypes[j].isMVProp,
csPropTypes[i].lstPropTypes[j].propCover,
+
csPropTypes[i].lstPropTypes[j].refTblSupport,
csPropTypes[i].lstPropTypes[j].numReferring,
+
csPropTypes[i].lstPropTypes[j].numDisRefValues,
+
(float)csPropTypes[i].lstPropTypes[j].numReferring /
csPropTypes[i].lstPropTypes[j].numDisRefValues
+ );
+ /* The three tallies are tested independently, so a multi-valued
+  * FK is counted as Many-to-Many in addition to whichever of the
+  * other two categories applies. */
+ if (csPropTypes[i].lstPropTypes[j].numReferring
== csPropTypes[i].lstPropTypes[j].numDisRefValues) numOtoO++;
+ if (csPropTypes[i].lstPropTypes[j].isMVProp)
numMtoM++;
+ if (csPropTypes[i].lstPropTypes[j].numReferring
> csPropTypes[i].lstPropTypes[j].numDisRefValues) numOtoM++;
+ }
+ }
+
+ }
+
+ printf("Done!\n");
+
+ printf("There are %d One to Many FKs\n", numOtoM);
+ printf("There are %d Many to Many FKs\n", numMtoM);
+ printf("There are %d One to One FKs\n", numOtoO);
+
+ fclose(fout);
+
+}
+
// for storing ontology data
oid **ontattributes = NULL;
int ontattributesCount = 0;
@@ -6339,6 +6398,7 @@ void getRealValue(ValPtr returnValue, oi
VALset(returnValue, TYPE_str, tmpStr);
break;
case DATETIME:
+ //printf("A Datetime object value: %s \n",objStr);
datetimeStr = getDateTimeFromRDFString(objStr);
VALset(returnValue, TYPE_str, datetimeStr);
break;
@@ -6406,6 +6466,12 @@ str RDFdistTriplesToCSs(int *ret, bat *s
int numPKcols = 0;
char isPossiblePK = 0;
#endif
+ #if COUNT_DISTINCT_REFERRED_S
+ BAT *tmpFKHashBat = NULL;
+ int initHashBatgz = 0;
+ BUN tmpFKRefBun = BUN_NONE;
+ char isFKCol = 0;
+ #endif
if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
throw(RDF, "RDFdistTriplesToCSs",
@@ -6499,6 +6565,7 @@ str RDFdistTriplesToCSs(int *ret, bat *s
tmpPropIdx = tmpTblIdxPropIdxMap[tblIdx];
+ //printf(" PropIdx = %d \n", tmpPropIdx);
tmpColIdx =
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defColIdx;
if (tmpColIdx == -1){ // This col is removed as an infrequent
prop
BUNappend(cstablestat->pbat,pbt , TRUE);
@@ -6528,6 +6595,9 @@ str RDFdistTriplesToCSs(int *ret, bat *s
}
//printf(" Tbl: %d | Col: %d \n", tblIdx, tmpColIdx);
+ #if COUNT_DISTINCT_REFERRED_S
+ isFKCol =
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isFKProp;
+ #endif
istmpMVProp =
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isMVProp;
defaultType =
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defaultType;
@@ -6546,10 +6616,19 @@ str RDFdistTriplesToCSs(int *ret, bat *s
if (isPossiblePK){
tmpHashBat = BATnew(TYPE_void, TYPE_oid,
lastSubjId[tblIdx] + 1);
(void)BATprepareHash(BATmirror(tmpHashBat));
+ BUNappend(tmpHashBat,obt, TRUE);
//Insert the first value
isCheckDone = 0;
numPKcols++;
}
#endif
+ #if COUNT_DISTINCT_REFERRED_S
+ if (isFKCol){
+ initHashBatgz =
(csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].refTblSupport >
smallHashBatsz)?smallHashBatsz:csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].refTblSupport;
+ tmpFKHashBat = BATnew(TYPE_void, TYPE_oid,
initHashBatgz + 1);
+ (void)BATprepareHash(BATmirror(tmpFKHashBat));
+ BUNappend(tmpFKHashBat,obt, TRUE);
//The first value
+ }
+ #endif
isSetLasttblIdx = 1;
}
@@ -6560,11 +6639,30 @@ str RDFdistTriplesToCSs(int *ret, bat *s
//Insert missing values for all columns of this
property in this table
fillMissingvaluesAll(cstablestat, csPropTypes,
lasttblIdx, lastColIdx, lastPropIdx, lastSubjId);
+
+ #if COUNT_DISTINCT_REFERRED_S
+ if
(csPropTypes[lasttblIdx].lstPropTypes[lastPropIdx].isFKProp ) {
+ //printf("Update refcount for FK Col at: Table
%d Prop %d (Orig Ref size: %d) --> " BUNFMT "\n", lasttblIdx, lastPropIdx,
csPropTypes[lasttblIdx].lstPropTypes[lastPropIdx].refTblSupport,
BATcount(tmpFKHashBat));
+
csPropTypes[lasttblIdx].lstPropTypes[lastPropIdx].numDisRefValues =
BATcount(tmpFKHashBat);
+ if (tmpFKHashBat != NULL){
+ BBPreclaim(tmpFKHashBat);
+ tmpFKHashBat = NULL;
+ }
+ }
+ if (isFKCol){
+ initHashBatgz =
(csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].refTblSupport >
smallHashBatsz)?smallHashBatsz:csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].refTblSupport;
+ tmpFKHashBat = BATnew(TYPE_void, TYPE_oid,
initHashBatgz + 1);
+ (void)BATprepareHash(BATmirror(tmpFKHashBat));
+ BUNappend(tmpFKHashBat,obt, TRUE);
//The first value
+ }
+ #endif
+
lastColIdx = tmpColIdx;
lastPropIdx = tmpPropIdx;
lasttblIdx = tblIdx;
tmplastInsertedS = -1;
cstablestat->lastInsertedS[tblIdx][tmpColIdx] =
BUN_NONE;
+
#if DETECT_PKCOL
if (isPossiblePK){
if (tmpHashBat != NULL){
@@ -6574,14 +6672,17 @@ str RDFdistTriplesToCSs(int *ret, bat *s
tmpHashBat = BATnew(TYPE_void, TYPE_oid,
lastSubjId[tblIdx] + 1);
(void)BATprepareHash(BATmirror(tmpHashBat));
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isPKProp = 1; /* Assume that the
object values are all unique*/
+ BUNappend(tmpHashBat,obt, TRUE);
//Insert the first value
isCheckDone = 0;
numPKcols++;
}
#endif
+
}
- #if DETECT_PKCOL
else{
+
+ #if DETECT_PKCOL
if (isCheckDone == 0 && isPossiblePK){
tmpObjBun = BUNfnd(BATmirror(tmpHashBat),(ptr)
obt);
if (tmpObjBun == BUN_NONE){
@@ -6594,8 +6695,17 @@ str RDFdistTriplesToCSs(int *ret, bat *s
//printf("Found duplicated value at "
BUNFMT " | " BUNFMT " | " BUNFMT "\n", *pbt, *sbt, *obt);
}
}
- }
- #endif
+
+ #endif
+ #if COUNT_DISTINCT_REFERRED_S
+ if (isFKCol){
+ tmpFKRefBun =
BUNfnd(BATmirror(tmpFKHashBat),(ptr) obt);
+ if (tmpFKRefBun == BUN_NONE){
+ BUNappend(tmpFKHashBat,obt, TRUE);
+ }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list