Changeset: 0c05e6360d8b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0c05e6360d8b
Modified Files:
        monetdb5/extras/rdf/rdf.h
        monetdb5/extras/rdf/rdf_shredder.c
        monetdb5/extras/rdf/rdfparser.h
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Analyze the FK cardinality.


diffs (truncated from 384 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -73,6 +73,7 @@ RDFpartialjoin (bat *res, bat *lmap, bat
 
 #define batsz 10000000
 #define smallbatsz 100000
+#define smallHashBatsz 10000
 
 #if STORE == TRIPLE_STORE
  typedef enum {
diff --git a/monetdb5/extras/rdf/rdf_shredder.c 
b/monetdb5/extras/rdf/rdf_shredder.c
--- a/monetdb5/extras/rdf/rdf_shredder.c
+++ b/monetdb5/extras/rdf/rdf_shredder.c
@@ -247,6 +247,9 @@ getObjectType(unsigned char* objStr, BUN
 static void 
 tripleHandler(void* user_data, const raptor_statement* triple)
 {
+#if CHECK_NUM_DBPONTOLOGY
+       const char* pos = NULL;
+#endif
        parserData *pdata = ((parserData *) user_data);
        BUN bun = BUN_NONE;
        BUN realNumValue = BUN_NONE; 
@@ -284,6 +287,11 @@ tripleHandler(void* user_data, const rap
                if (triple->predicate->type == RAPTOR_TERM_TYPE_URI) {
                        unsigned char* predicateStr;
                        predicateStr = raptor_term_to_string(triple->predicate);
+                       #if CHECK_NUM_DBPONTOLOGY
+                       if ( (pos = strstr((str)predicateStr , 
"http://dbpedia.org/ontology")) != NULL){
+                               pdata->numOntologyTriples++;
+                       }
+                       #endif
                        //rdf_insert(pdata, graph[MAP_LEX], (str) predicateStr, 
&bun);
                        rdf_tknzr_insert((str) predicateStr, &bun);
                        rdf_BUNappend(pdata, graph[P_sort], &bun); 
@@ -377,6 +385,9 @@ parserData_create (str location, BAT** g
        pdata->warning = 0;
        pdata->location = location;
        pdata->graph = graph;
+#if CHECK_NUM_DBPONTOLOGY
+       pdata->numOntologyTriples = 0;
+#endif
 
        for (i = 0; i <= N_GRAPH_BAT; i++) {
                pdata->graph[i] = NULL;
@@ -719,7 +730,11 @@ RDFParser (BAT **graph, str *location, s
                                                        uri = 
raptor_new_uri(world,raptor_uri_filename_to_uri_string(tmpfilename));
                                                        iret = 
raptor_parser_parse_file(rparser, uri, NULL);
                                                        raptor_free_uri(uri);
-                                                       printf(".. Done \n");
+                                                       #if 
CHECK_NUM_DBPONTOLOGY
+                                                       printf(".. Done (No 
errors: %d | Loaded: " BUNFMT " | Ontology-based: %d) \n",pdata->error, 
pdata->tcount, pdata->numOntologyTriples);
+                                                       #else
+                                                       printf(".. Done (No 
errors: %d | Loaded: " BUNFMT ") \n",pdata->error, pdata->tcount);
+                                                       #endif
                                                }
                                        }
                                        closedir (dp);
@@ -779,6 +794,11 @@ RDFParser (BAT **graph, str *location, s
        }
 
 #else
+       #if CHECK_NUM_DBPONTOLOGY
+       printf("Total number of triples loaded: " BUNFMT " (Number of available 
ontology-based triples: %d) \n", pdata->tcount, pdata->numOntologyTriples);
+       #else
+       printf("Total number of triples loaded: " BUNFMT "\n", pdata->tcount);
+       #endif
        printf("Total number of error %d , fatal %d , warning %d", 
pdata->error, pdata->fatal, pdata->warning);
 #endif
        /* post processing step */
diff --git a/monetdb5/extras/rdf/rdfparser.h b/monetdb5/extras/rdf/rdfparser.h
--- a/monetdb5/extras/rdf/rdfparser.h
+++ b/monetdb5/extras/rdf/rdfparser.h
@@ -29,7 +29,7 @@
 
 #include <raptor2.h>
 
-
+#define CHECK_NUM_DBPONTOLOGY   0       /* Check how many rdf triples use 
dbpontology */
 
 typedef struct parserData {
                                      /**PROPERTIES             */
@@ -51,6 +51,10 @@ typedef struct parserData {
                                      /**GRAPH DATA             */
        BAT **graph;                  /* BATs for the result
                                         shredded RDF graph     */
+#if CHECK_NUM_DBPONTOLOGY
+       int numOntologyTriples; 
+#endif
+       
 } parserData;
 
 void fatalHandler (void *user_data, raptor_log_message* message);
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -621,6 +621,9 @@ void initCSPropTypes(CSPropTypes* csProp
                                csPropTypes[id].lstPropTypes[j].defColIdx = -1; 
                                csPropTypes[id].lstPropTypes[j].isFKProp = 0;
                                csPropTypes[id].lstPropTypes[j].refTblId = -1; 
+                               csPropTypes[id].lstPropTypes[j].refTblSupport = 
0;
+                               csPropTypes[id].lstPropTypes[j].numReferring = 
0;
+                               csPropTypes[id].lstPropTypes[j].numDisRefValues 
= 0;
                                csPropTypes[id].lstPropTypes[j].isDirtyFKProp = 
0; 
                                csPropTypes[id].lstPropTypes[j].lstTypes = 
(char*)GDKmalloc(sizeof(char) * csPropTypes[id].lstPropTypes[j].numType);
                                csPropTypes[id].lstPropTypes[j].lstFreq = 
(int*)GDKmalloc(sizeof(int) * csPropTypes[id].lstPropTypes[j].numType);
@@ -5298,6 +5301,9 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
                        //Add rel info to csPropTypes
                        csPropTypes[from].lstPropTypes[propIdx].isFKProp = 1; 
                        csPropTypes[from].lstPropTypes[propIdx].refTblId = to;
+                       csPropTypes[from].lstPropTypes[propIdx].refTblSupport = 
freqCSset->items[toFreqId].support;
+                       csPropTypes[from].lstPropTypes[propIdx].numReferring = 
rel.lstCnt[j];
+
 
                }
        }
@@ -5476,6 +5482,59 @@ str printFKs(CSrel *csRelFinalFKs, int f
        return MAL_SUCCEED; 
 }
 
+
+static 
+void printFKMultiplicityFromCSPropTypes(CSPropTypes* csPropTypes, int 
numMergedCS, int freqThreshold){
+       char filename[100]; 
+       char tmpStr[50]; 
+       FILE *fout; 
+       int i, j; 
+       
+       int     numMtoM = 0;    //Many To Many
+       int     numOtoM = 0;    //One to Many
+       int     numOtoO = 0; 
+
+       printf("Collect the statistic for FK multiplicity ... ");
+       strcpy(filename, "FKMultiplicity");
+       sprintf(tmpStr, "%d", freqThreshold);
+       strcat(filename, tmpStr);
+       strcat(filename, ".txt");
+
+       fout = fopen(filename,"wt"); 
+
+       /* Print cspropTypes */
+
+       fprintf(fout, "FromTbl  PropId  Prop    ToTbl   isMultiProp     
PropCoverage    RefferedTblSupport      NumReffering    NumReferred     
Ratio\n");
+       for (i = 0; i < numMergedCS; i++){
+               for(j = 0; j < csPropTypes[i].numProp; j++){
+                       if (csPropTypes[i].lstPropTypes[j].isFKProp){
+                               if 
(csPropTypes[i].lstPropTypes[j].numDisRefValues == 0) continue; // These 
columns may be put into PSO, thus no FK
+
+                               fprintf(fout, "%d       %d      "BUNFMT"        
%d      %d      %d      %d      %d      %d      %f\n",
+                                               i , j, 
csPropTypes[i].lstPropTypes[j].prop, csPropTypes[i].lstPropTypes[j].refTblId, 
+                                               
csPropTypes[i].lstPropTypes[j].isMVProp, 
csPropTypes[i].lstPropTypes[j].propCover, 
+                                               
csPropTypes[i].lstPropTypes[j].refTblSupport, 
csPropTypes[i].lstPropTypes[j].numReferring,
+                                               
csPropTypes[i].lstPropTypes[j].numDisRefValues, 
+                                               
(float)csPropTypes[i].lstPropTypes[j].numReferring / 
csPropTypes[i].lstPropTypes[j].numDisRefValues
+                                               );
+                               if (csPropTypes[i].lstPropTypes[j].numReferring 
== csPropTypes[i].lstPropTypes[j].numDisRefValues) numOtoO++;
+                               if  (csPropTypes[i].lstPropTypes[j].isMVProp)   
numMtoM++;
+                               if (csPropTypes[i].lstPropTypes[j].numReferring 
> csPropTypes[i].lstPropTypes[j].numDisRefValues) numOtoM++;    
+                       }
+               }
+
+       }
+       
+       printf("Done!\n"); 
+       
+       printf("There are %d One to Many FKs\n", numOtoM);
+       printf("There are %d Many to Many FKs\n", numMtoM);
+       printf("There are %d One to One FKs\n", numOtoO);
+
+       fclose(fout); 
+
+}
+
 // for storing ontology data
 oid    **ontattributes = NULL;
 int    ontattributesCount = 0;
@@ -6339,6 +6398,7 @@ void getRealValue(ValPtr returnValue, oi
                        VALset(returnValue, TYPE_str, tmpStr);
                        break; 
                case DATETIME:
+                       //printf("A Datetime object value: %s \n",objStr);
                        datetimeStr = getDateTimeFromRDFString(objStr);
                        VALset(returnValue, TYPE_str, datetimeStr);
                        break; 
@@ -6406,6 +6466,12 @@ str RDFdistTriplesToCSs(int *ret, bat *s
        int     numPKcols = 0; 
        char    isPossiblePK = 0; 
        #endif
+       #if     COUNT_DISTINCT_REFERRED_S
+       BAT     *tmpFKHashBat = NULL;
+       int     initHashBatgz = 0; 
+       BUN     tmpFKRefBun = BUN_NONE; 
+       char    isFKCol = 0; 
+       #endif
        
        if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
                throw(RDF, "RDFdistTriplesToCSs",
@@ -6499,6 +6565,7 @@ str RDFdistTriplesToCSs(int *ret, bat *s
 
 
                tmpPropIdx = tmpTblIdxPropIdxMap[tblIdx]; 
+               //printf(" PropIdx = %d \n", tmpPropIdx);
                tmpColIdx = 
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defColIdx; 
                if (tmpColIdx == -1){   // This col is removed as an infrequent 
prop
                        BUNappend(cstablestat->pbat,pbt , TRUE);
@@ -6528,6 +6595,9 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                }
 
                //printf(" Tbl: %d   |   Col: %d \n", tblIdx, tmpColIdx);
+               #if COUNT_DISTINCT_REFERRED_S
+               isFKCol = 
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isFKProp; 
+               #endif
                
                istmpMVProp = 
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isMVProp; 
                defaultType = 
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defaultType; 
@@ -6546,10 +6616,19 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                        if (isPossiblePK){
                                tmpHashBat = BATnew(TYPE_void, TYPE_oid, 
lastSubjId[tblIdx] + 1);
                                (void)BATprepareHash(BATmirror(tmpHashBat));
+                               BUNappend(tmpHashBat,obt, TRUE);                
//Insert the first value
                                isCheckDone = 0; 
                                numPKcols++;
                        }
                        #endif
+                       #if COUNT_DISTINCT_REFERRED_S
+                       if (isFKCol){
+                               initHashBatgz = 
(csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].refTblSupport > 
smallHashBatsz)?smallHashBatsz:csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].refTblSupport;
+                               tmpFKHashBat = BATnew(TYPE_void, TYPE_oid, 
initHashBatgz + 1);
+                               (void)BATprepareHash(BATmirror(tmpFKHashBat));
+                               BUNappend(tmpFKHashBat,obt, TRUE);              
//The first value
+                       }
+                       #endif
                        isSetLasttblIdx = 1; 
                }
 
@@ -6560,11 +6639,30 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                        //Insert missing values for all columns of this 
property in this table
 
                        fillMissingvaluesAll(cstablestat, csPropTypes, 
lasttblIdx, lastColIdx, lastPropIdx, lastSubjId);
+                               
+                       #if COUNT_DISTINCT_REFERRED_S
+                       if 
(csPropTypes[lasttblIdx].lstPropTypes[lastPropIdx].isFKProp ) {
+                               //printf("Update refcount for FK Col at: Table 
%d  Prop %d (Orig Ref size: %d) --> " BUNFMT "\n", lasttblIdx, lastPropIdx, 
csPropTypes[lasttblIdx].lstPropTypes[lastPropIdx].refTblSupport, 
BATcount(tmpFKHashBat)); 
+                               
csPropTypes[lasttblIdx].lstPropTypes[lastPropIdx].numDisRefValues = 
BATcount(tmpFKHashBat);
+                               if (tmpFKHashBat != NULL){
+                                       BBPreclaim(tmpFKHashBat);
+                                       tmpFKHashBat = NULL; 
+                               }
+                       }
+                       if (isFKCol){
+                               initHashBatgz = 
(csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].refTblSupport > 
smallHashBatsz)?smallHashBatsz:csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].refTblSupport;
+                               tmpFKHashBat = BATnew(TYPE_void, TYPE_oid, 
initHashBatgz + 1);
+                               (void)BATprepareHash(BATmirror(tmpFKHashBat));
+                               BUNappend(tmpFKHashBat,obt, TRUE);              
//The first value
+                       }
+                       #endif
+
                        lastColIdx = tmpColIdx; 
                        lastPropIdx = tmpPropIdx; 
                        lasttblIdx = tblIdx;
                        tmplastInsertedS = -1;
                        cstablestat->lastInsertedS[tblIdx][tmpColIdx] = 
BUN_NONE;
+
                        #if     DETECT_PKCOL    
                        if (isPossiblePK){
                                if (tmpHashBat != NULL){
@@ -6574,14 +6672,17 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                                tmpHashBat = BATnew(TYPE_void, TYPE_oid, 
lastSubjId[tblIdx] + 1);
                                (void)BATprepareHash(BATmirror(tmpHashBat));
                                
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].isPKProp = 1;  /* Assume that the 
object values are all unique*/
+                               BUNappend(tmpHashBat,obt, TRUE);                
//Insert the first value
                                isCheckDone = 0;
                                numPKcols++;
                        }
                        #endif
+
                        
                }
-               #if     DETECT_PKCOL
                else{
+
+                       #if     DETECT_PKCOL
                        if (isCheckDone == 0 && isPossiblePK){
                                tmpObjBun = BUNfnd(BATmirror(tmpHashBat),(ptr) 
obt);
                                if (tmpObjBun == BUN_NONE){
@@ -6594,8 +6695,17 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                                        //printf("Found duplicated value at " 
BUNFMT "  |  " BUNFMT " | " BUNFMT "\n", *pbt, *sbt, *obt);
                                }
                        }
-               }
-               #endif
+
+                       #endif
+                       #if COUNT_DISTINCT_REFERRED_S
+                       if (isFKCol){
+                               tmpFKRefBun = 
BUNfnd(BATmirror(tmpFKHashBat),(ptr) obt);
+                               if (tmpFKRefBun == BUN_NONE){
+                                       BUNappend(tmpFKHashBat,obt, TRUE);
+                               }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to