Changeset: 60cf81b9ceb5 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=60cf81b9ceb5
Added Files:
        monetdb5/extras/rdf/rdfgraph.c
        monetdb5/extras/rdf/rdfgraph.h
Modified Files:
        monetdb5/extras/rdf/Makefile.ag
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Modify the algorithm for identifying dimension table.

- The number of PR iterations is computed based on the diameter of the graph 
formed by CS's and their relationships

- The threshold of the dimension table IR score is greater than pow(5, number 
of iterations)


diffs (truncated from 365 to 300 lines):

diff --git a/monetdb5/extras/rdf/Makefile.ag b/monetdb5/extras/rdf/Makefile.ag
--- a/monetdb5/extras/rdf/Makefile.ag
+++ b/monetdb5/extras/rdf/Makefile.ag
@@ -32,7 +32,7 @@ lib__rdf = {
        #MODULE
        NOINST
        #DIR = libdir/monetdb5
-       SOURCES = rdf.h rdftypes.h rdfschema.h rdfminheap.h rdfminheap.c 
rdflabels.h rdfretrieval.h rdfparser.h rdftypes.c rdfparser.c rdfontologyload.h 
rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c rdflabels.c 
rdfretrieval.c  
+       SOURCES = rdf.h rdftypes.h rdfschema.h rdfgraph.h rdfgraph.c 
rdfminheap.h rdfminheap.c rdflabels.h rdfretrieval.h rdfparser.h rdftypes.c 
rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c 
rdfschema.c rdflabels.c rdfretrieval.c  
 
        #SEP = _
        # LIBS =  ./hashmap/librdfhash  
diff --git a/monetdb5/extras/rdf/rdfgraph.c b/monetdb5/extras/rdf/rdfgraph.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/rdfgraph.c
@@ -0,0 +1,178 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2013 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+/* This contains graph algorithms for the graph formed by CS's and their 
relationships */
+
+#include "monetdb_config.h"
+#include "mal_exception.h"
+#include "url.h"
+#include "tokenizer.h"
+#include <gdk.h>
+#include <rdf.h>
+#include <rdfschema.h>
+#include <rdfgraph.h>
+
+
+
+static
+void visit(int curV, CSrel *csrelSet, char *visited, int* d, int level, int* 
maxd, int* maxV, char* globalVisited){
+       int j; 
+       int nextV; 
+
+       visited[curV] = 1; 
+       globalVisited[curV] = 1; 
+       d[curV] = level; 
+       
+       //Update and store the value with max distance
+       if (d[curV] > *maxd){
+               *maxd = d[curV];
+               *maxV = curV; 
+       }
+       
+       //We use this condition for preventing the time consumption in
+       //finding the graph diameter. Because, even though the graph 
+       //diameter may be large, we will not run more than 10 iterations
+       //for detecting the dimension table.
+       //
+       if (*maxd > MAX_ITERATION_NO){          
+               return; 
+       }
+
+       if (csrelSet[curV].numRef != 0){
+               for  (j = 0; j < csrelSet[curV].numRef; j++){
+                       nextV = csrelSet[curV].lstRefFreqIdx[j];
+
+                       if (!visited[nextV]) visit(nextV, csrelSet, visited, d, 
level + 1, maxd, maxV, globalVisited); 
+               }
+       }
+}
+
+static
+void bfs(int numV, int start, CSrel *csrelSet, int* maxd, int* maxV, char* 
globalVisited){
+       
+       char* visited; 
+       int i;
+       int* d;         //distances
+
+       visited = (char*)GDKmalloc(sizeof(char) * numV); 
+       d = (int*)GDKmalloc(sizeof(int) * numV); 
+       
+       for (i = 0; i < numV; i++){
+               visited[i] = 0;
+       }
+       
+       *maxd = -1; 
+
+       visit(start,csrelSet,visited, d, 0, maxd, maxV, globalVisited); 
+
+       GDKfree(visited); 
+       GDKfree(d); 
+}
+
+
+//This function is implemented according to 
http://link.springer.com/chapter/10.1007%2F11764298_9
+//k: Number of vertices
+//
+int getDiameter(int k, int numV,CSrel *csrelSet){
+       int i; 
+       int maxd = -1;
+       int maxV = -1;
+       int startV = -1;
+
+       int unCheckV = 0; 
+       int nConnectGraph = 0;
+
+       char* globalVisited = NULL;     //To handle the case when the graph is 
not connected
+       int globalMaxd = 0; 
+
+       //init
+       globalVisited = (char*) GDKmalloc(sizeof(char) * numV); 
+       for (i = 0; i < numV; i++){
+               globalVisited[i] = 0;
+       }
+       
+       unCheckV = 0;
+       //Go through each connected graph
+       for (unCheckV = 0; unCheckV < numV; unCheckV++){
+               if (globalVisited[unCheckV] || csrelSet[unCheckV].numRef == 0) 
continue;
+               
+               //For each connected graph, run k times 
+               nConnectGraph++;
+               printf("Start with connected graph %d from node %d\n", 
nConnectGraph,unCheckV);
+
+               for (i = 0; i < k; i++){
+                       if (i == 0) startV = unCheckV;
+                       else{
+                               startV = maxV;  
+                       }
+                       
+                       bfs(numV,startV, csrelSet, &maxd, &maxV, globalVisited);
+                       //printf("Max distance after the %d bfs is %d (from %d 
to %d)\n ", i, maxd, startV, maxV);
+                       if (maxd > globalMaxd) globalMaxd = maxd;
+               }
+
+               if (globalMaxd > MAX_ITERATION_NO)
+                       break; 
+       }               
+
+
+       GDKfree(globalVisited); 
+
+       printf("Diameter is %d\n",globalMaxd);
+       return globalMaxd;      
+}
+
+int getDiameterExact(int numV,CSrel *csrelSet){
+       int i; 
+       int maxd = -1;
+       int maxV = -1;
+       int startV = -1;
+
+       int unCheckV = 0; 
+
+       char* globalVisited = NULL;     //To handle the case when the graph is 
not connected
+       int globalMaxd = 0; 
+
+       //init
+       globalVisited = (char*) GDKmalloc(sizeof(char) * numV); 
+       for (i = 0; i < numV; i++){
+               globalVisited[i] = 0;
+       }
+       
+       //Go through each connected graph
+       for (unCheckV = 0; unCheckV < numV; unCheckV++){
+               if (csrelSet[unCheckV].numRef == 0) continue; 
+               if (unCheckV % 1000 == 0) printf("Updated globalMaxd = 
%d\n",globalMaxd);
+               startV = unCheckV;
+               bfs(numV,startV, csrelSet, &maxd, &maxV, globalVisited);
+               //printf("Max distance after the %d bfs is %d (from %d to %d)\n 
", i, maxd, startV, maxV);
+               if (maxd > globalMaxd){ 
+                       globalMaxd = maxd;
+                       //printf("New max %d from startV 
%d\n",globalMaxd,startV);
+               }
+
+               if (globalMaxd > MAX_ITERATION_NO) break; 
+       }               
+
+
+       GDKfree(globalVisited); 
+
+       printf("Diameter is %d\n",globalMaxd);
+       return globalMaxd;      
+}
diff --git a/monetdb5/extras/rdf/rdfgraph.h b/monetdb5/extras/rdf/rdfgraph.h
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/rdfgraph.h
@@ -0,0 +1,31 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2013 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+#ifndef _RDFGRAPH_H_
+#define _RDFGRAPH_H_
+
+#include <rdfschema.h>
+
+rdf_export int
+getDiameter(int k, int numV,CSrel *csrelSet);
+
+rdf_export int 
+getDiameterExact(int numV,CSrel *csrelSet);
+
+#endif /* _RDFGRAPH_H_ */
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -35,6 +35,7 @@
 #include "rdfminheap.h"
 #include "rdfontologyload.h"
 #include <mtime.h>
+#include <rdfgraph.h>
 
 #define SHOWPROPERTYNAME 1
 
@@ -548,34 +549,27 @@ void getIRNums(CSrel *csrelSet, CSset *f
                for (i = 0; i < num; i++){
                        lastIRScores[i] = curIRScores[i]; 
                }
-       
                /*
                printf(" ======== After %d iteration \n", k); 
                for (i = 0; i < num; i++){
-                       printf("IR score[%d] is %f \n", i, curIRScores[i]);
+                       printf("IR score[%d] is %f (support: %d)\n", i, 
curIRScores[i],freqCSset->items[i].support);
                }
                */
        }
-       
-
 
        free(lastIRScores);
 }
 
 
 static 
-void updateFreqCStype(CSset *freqCSset, int num,  float *curIRScores, int 
*refCount){
+void updateFreqCStype(CSset *freqCSset, int num,  float *curIRScores, int 
*refCount, int nIterIR){
 
        int     i; 
        int     numDimensionCS = 0; 
-       int     totalSupport = 0;       /* Total CS frequency */
-       float   threshold  = 0.0; 
-       
-       for (i = 0; i < num; i++){      
-               totalSupport += freqCSset->items[i].support; 
-       }
-       threshold = (float)totalSupport * IR_DIMENSION_THRESHOLD_PERCENTAGE; 
-       printf("Total support %d --> Threshold for dimension table is: %f \n", 
totalSupport, threshold);
+       int     threshold = 0; 
+       int     ratio; 
+
+       ratio = pow(IR_DIMENSION_FACTOR, nIterIR);
 
        printf("List of dimension tables: \n");
        for (i = 0; i < num; i++){
@@ -583,11 +577,12 @@ void updateFreqCStype(CSset *freqCSset, 
                if (freqCSset->items[i].support > MINIMUM_TABLE_SIZE) continue; 
                #endif
                if (refCount[i] < freqCSset->items[i].support) continue; 
+               threshold = freqCSset->items[i].support * ratio;
                if (curIRScores[i] < threshold) continue; 
                
                freqCSset->items[i].type = DIMENSIONCS;
                //printf("A dimension CS with IR score = %f \n", 
curIRScores[i]);
-               printf(" %d  ", i);
+               printf(" %d  (Ratio %.2f)", i, (float) 
curIRScores[i]/freqCSset->items[i].support);
                numDimensionCS++;
        }
        
@@ -4281,7 +4276,7 @@ str mergeFreqCSByS1(CSset *freqCSset, CS
 #endif
 
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to