Changeset: 60cf81b9ceb5 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=60cf81b9ceb5
Added Files:
monetdb5/extras/rdf/rdfgraph.c
monetdb5/extras/rdf/rdfgraph.h
Modified Files:
monetdb5/extras/rdf/Makefile.ag
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Modify the algorithm for identifying dimension table.
- The number of PR iterations is computed based on the diameter of the graph
formed by CS's and their relationships
- The threshold of the dimension table IR score is greater than pow(5, number
of iterations)
diffs (truncated from 365 to 300 lines):
diff --git a/monetdb5/extras/rdf/Makefile.ag b/monetdb5/extras/rdf/Makefile.ag
--- a/monetdb5/extras/rdf/Makefile.ag
+++ b/monetdb5/extras/rdf/Makefile.ag
@@ -32,7 +32,7 @@ lib__rdf = {
#MODULE
NOINST
#DIR = libdir/monetdb5
- SOURCES = rdf.h rdftypes.h rdfschema.h rdfminheap.h rdfminheap.c
rdflabels.h rdfretrieval.h rdfparser.h rdftypes.c rdfparser.c rdfontologyload.h
rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c rdflabels.c
rdfretrieval.c
+ SOURCES = rdf.h rdftypes.h rdfschema.h rdfgraph.h rdfgraph.c
rdfminheap.h rdfminheap.c rdflabels.h rdfretrieval.h rdfparser.h rdftypes.c
rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c
rdfschema.c rdflabels.c rdfretrieval.c
#SEP = _
# LIBS = ./hashmap/librdfhash
diff --git a/monetdb5/extras/rdf/rdfgraph.c b/monetdb5/extras/rdf/rdfgraph.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/rdfgraph.c
@@ -0,0 +1,178 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2013 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+/* This contains graph algorithms for the graph formed by CS's and their
relationships */
+
+#include "monetdb_config.h"
+#include "mal_exception.h"
+#include "url.h"
+#include "tokenizer.h"
+#include <gdk.h>
+#include <rdf.h>
+#include <rdfschema.h>
+#include <rdfgraph.h>
+
+
+
+static
+void visit(int curV, CSrel *csrelSet, char *visited, int* d, int level, int*
maxd, int* maxV, char* globalVisited){
+ int j;
+ int nextV;
+
+ visited[curV] = 1;
+ globalVisited[curV] = 1;
+ d[curV] = level;
+
+ //Update and store the value with max distance
+ if (d[curV] > *maxd){
+ *maxd = d[curV];
+ *maxV = curV;
+ }
+
+ //We use this condition for preventing the time consumption in
+ //finding the graph diameter. Because, even though the graph
+ //diameter may be large, we will not run more than 10 iterations
+ //for detecting the dimension table.
+ //
+ if (*maxd > MAX_ITERATION_NO){
+ return;
+ }
+
+ if (csrelSet[curV].numRef != 0){
+ for (j = 0; j < csrelSet[curV].numRef; j++){
+ nextV = csrelSet[curV].lstRefFreqIdx[j];
+
+ if (!visited[nextV]) visit(nextV, csrelSet, visited, d,
level + 1, maxd, maxV, globalVisited);
+ }
+ }
+}
+
+static
+void bfs(int numV, int start, CSrel *csrelSet, int* maxd, int* maxV, char*
globalVisited){
+
+ char* visited;
+ int i;
+ int* d; //distances
+
+ visited = (char*)GDKmalloc(sizeof(char) * numV);
+ d = (int*)GDKmalloc(sizeof(int) * numV);
+
+ for (i = 0; i < numV; i++){
+ visited[i] = 0;
+ }
+
+ *maxd = -1;
+
+ visit(start,csrelSet,visited, d, 0, maxd, maxV, globalVisited);
+
+ GDKfree(visited);
+ GDKfree(d);
+}
+
+
+//This function is implemented according to
http://link.springer.com/chapter/10.1007%2F11764298_9
+//k: Number of vertices
+//
+int getDiameter(int k, int numV,CSrel *csrelSet){
+ int i;
+ int maxd = -1;
+ int maxV = -1;
+ int startV = -1;
+
+ int unCheckV = 0;
+ int nConnectGraph = 0;
+
+ char* globalVisited = NULL; //To handle the case when the graph is
not connected
+ int globalMaxd = 0;
+
+ //init
+ globalVisited = (char*) GDKmalloc(sizeof(char) * numV);
+ for (i = 0; i < numV; i++){
+ globalVisited[i] = 0;
+ }
+
+ unCheckV = 0;
+ //Go through each connected graph
+ for (unCheckV = 0; unCheckV < numV; unCheckV++){
+ if (globalVisited[unCheckV] || csrelSet[unCheckV].numRef == 0)
continue;
+
+ //For each connected graph, run k times
+ nConnectGraph++;
+ printf("Start with connected graph %d from node %d\n",
nConnectGraph,unCheckV);
+
+ for (i = 0; i < k; i++){
+ if (i == 0) startV = unCheckV;
+ else{
+ startV = maxV;
+ }
+
+ bfs(numV,startV, csrelSet, &maxd, &maxV, globalVisited);
+ //printf("Max distance after the %d bfs is %d (from %d
to %d)\n ", i, maxd, startV, maxV);
+ if (maxd > globalMaxd) globalMaxd = maxd;
+ }
+
+ if (globalMaxd > MAX_ITERATION_NO)
+ break;
+ }
+
+
+ GDKfree(globalVisited);
+
+ printf("Diameter is %d\n",globalMaxd);
+ return globalMaxd;
+}
+
+int getDiameterExact(int numV,CSrel *csrelSet){
+ int i;
+ int maxd = -1;
+ int maxV = -1;
+ int startV = -1;
+
+ int unCheckV = 0;
+
+ char* globalVisited = NULL; //To handle the case when the graph is
not connected
+ int globalMaxd = 0;
+
+ //init
+ globalVisited = (char*) GDKmalloc(sizeof(char) * numV);
+ for (i = 0; i < numV; i++){
+ globalVisited[i] = 0;
+ }
+
+ //Go through each connected graph
+ for (unCheckV = 0; unCheckV < numV; unCheckV++){
+ if (csrelSet[unCheckV].numRef == 0) continue;
+ if (unCheckV % 1000 == 0) printf("Updated globalMaxd =
%d\n",globalMaxd);
+ startV = unCheckV;
+ bfs(numV,startV, csrelSet, &maxd, &maxV, globalVisited);
+ //printf("Max distance after the %d bfs is %d (from %d to %d)\n
", i, maxd, startV, maxV);
+ if (maxd > globalMaxd){
+ globalMaxd = maxd;
+ //printf("New max %d from startV
%d\n",globalMaxd,startV);
+ }
+
+ if (globalMaxd > MAX_ITERATION_NO) break;
+ }
+
+
+ GDKfree(globalVisited);
+
+ printf("Diameter is %d\n",globalMaxd);
+ return globalMaxd;
+}
diff --git a/monetdb5/extras/rdf/rdfgraph.h b/monetdb5/extras/rdf/rdfgraph.h
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/rdfgraph.h
@@ -0,0 +1,31 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2013 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+#ifndef _RDFGRAPH_H_
+#define _RDFGRAPH_H_
+
+#include <rdfschema.h>
+
+rdf_export int
+getDiameter(int k, int numV,CSrel *csrelSet);
+
+rdf_export int
+getDiameterExact(int numV,CSrel *csrelSet);
+
+#endif /* _RDFGRAPH_H_ */
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -35,6 +35,7 @@
#include "rdfminheap.h"
#include "rdfontologyload.h"
#include <mtime.h>
+#include <rdfgraph.h>
#define SHOWPROPERTYNAME 1
@@ -548,34 +549,27 @@ void getIRNums(CSrel *csrelSet, CSset *f
for (i = 0; i < num; i++){
lastIRScores[i] = curIRScores[i];
}
-
/*
printf(" ======== After %d iteration \n", k);
for (i = 0; i < num; i++){
- printf("IR score[%d] is %f \n", i, curIRScores[i]);
+ printf("IR score[%d] is %f (support: %d)\n", i,
curIRScores[i],freqCSset->items[i].support);
}
*/
}
-
-
free(lastIRScores);
}
static
-void updateFreqCStype(CSset *freqCSset, int num, float *curIRScores, int
*refCount){
+void updateFreqCStype(CSset *freqCSset, int num, float *curIRScores, int
*refCount, int nIterIR){
int i;
int numDimensionCS = 0;
- int totalSupport = 0; /* Total CS frequency */
- float threshold = 0.0;
-
- for (i = 0; i < num; i++){
- totalSupport += freqCSset->items[i].support;
- }
- threshold = (float)totalSupport * IR_DIMENSION_THRESHOLD_PERCENTAGE;
- printf("Total support %d --> Threshold for dimension table is: %f \n",
totalSupport, threshold);
+ int threshold = 0;
+ int ratio;
+
+ ratio = pow(IR_DIMENSION_FACTOR, nIterIR);
printf("List of dimension tables: \n");
for (i = 0; i < num; i++){
@@ -583,11 +577,12 @@ void updateFreqCStype(CSset *freqCSset,
if (freqCSset->items[i].support > MINIMUM_TABLE_SIZE) continue;
#endif
if (refCount[i] < freqCSset->items[i].support) continue;
+ threshold = freqCSset->items[i].support * ratio;
if (curIRScores[i] < threshold) continue;
freqCSset->items[i].type = DIMENSIONCS;
//printf("A dimension CS with IR score = %f \n",
curIRScores[i]);
- printf(" %d ", i);
+ printf(" %d (Ratio %.2f)", i, (float)
curIRScores[i]/freqCSset->items[i].support);
numDimensionCS++;
}
@@ -4281,7 +4276,7 @@ str mergeFreqCSByS1(CSset *freqCSset, CS
#endif
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list