Changeset: 3db233ec0f01 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3db233ec0f01
Added Files:
monetdb5/extras/rdf/rdfretrieval.c
monetdb5/extras/rdf/rdfretrieval.h
Modified Files:
monetdb5/extras/rdf/Makefile.ag
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:
Sub schema retrieval heuristics
Three Algorithms to choose a fixed number of connected CS's containing as much
data as possible.
diffs (truncated from 632 to 300 lines):
diff --git a/monetdb5/extras/rdf/Makefile.ag b/monetdb5/extras/rdf/Makefile.ag
--- a/monetdb5/extras/rdf/Makefile.ag
+++ b/monetdb5/extras/rdf/Makefile.ag
@@ -32,7 +32,7 @@ lib__rdf = {
#MODULE
NOINST
#DIR = libdir/monetdb5
- SOURCES = rdf.h rdfschema.h rdflabels.h rdfparser.h rdfparser.c
rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c
rdflabels.c
+ SOURCES = rdf.h rdfschema.h rdflabels.h rdfretrieval.h rdfparser.h
rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c
rdfschema.c rdflabels.c rdfretrieval.c
#SEP = _
# LIBS = ./hashmap/librdfhash
diff --git a/monetdb5/extras/rdf/rdfretrieval.c
b/monetdb5/extras/rdf/rdfretrieval.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/rdfretrieval.c
@@ -0,0 +1,533 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2013 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+#include "monetdb_config.h"
+#include "rdf.h"
+#include "rdfretrieval.h"
+#include "rdfschema.h"
+#include "rdflabels.h"
+
+static
+char** initAdjacencyMatrix(int csCount) {
+ char **matrix = NULL; // matrix[from][to]
+ int i, j;
+
+ matrix = (char **) malloc(sizeof(char *) * csCount);
+ if (!matrix) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+ for (i = 0; i < csCount; ++i) {
+ matrix[i] = (char *) malloc(sizeof(char *) * csCount);
+ if (!matrix) fprintf(stderr, "ERROR: Couldn't realloc
memory!\n");
+
+ for (j = 0; j < csCount; ++j) {
+ matrix[i][j] = 0;
+ }
+ }
+
+ return matrix;
+}
+
+static
+void createAdjacencyMatrix(char** matrix, CSset* freqCSset, CSmergeRel*
csRelBetweenMergeFreqSet) {
+ int i, j, k;
+
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
+ for (j = 0; j < freqCSset->items[i].numProp; ++j) { // propNo
in CS order
+ // check foreign key frequency
+ int sum = 0;
+ for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef;
++k) {
+ if (csRelBetweenMergeFreqSet[i].lstPropId[k] ==
freqCSset->items[i].lstProp[j]) {
+ sum +=
csRelBetweenMergeFreqSet[i].lstCnt[k];
+ }
+ }
+
+ for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef;
++k) { // propNo in CSrel
+ if (csRelBetweenMergeFreqSet[i].lstPropId[k] ==
freqCSset->items[i].lstProp[j]) {
+ int to =
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
+ if (i == to) continue; // ignore self
references
+ if ((int) (100.0 *
csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD)
continue; // foreign key is not frequent enough
+ matrix[i][to] = 1;
+ }
+ }
+ }
+ }
+}
+
+static
+NodeStat* initNodeStats(CSset* freqCSset) {
+ NodeStat* nodeStats = NULL;
+ int i;
+
+ nodeStats = (NodeStat *) malloc(sizeof(NodeStat) *
freqCSset->numCSadded);
+ if (!nodeStats) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+ nodeStats[i].origWeight = freqCSset->items[i].support;
+ nodeStats[i].weight = freqCSset->items[i].support; // weight =
origWeight
+ nodeStats[i].steps = -1;
+ nodeStats[i].predecessor = -1;
+ }
+
+ return nodeStats;
+}
+
+static
+NodeStat* initNodeStats23(CSset* freqCSset) {
+ NodeStat* nodeStats = NULL;
+ int i;
+
+ nodeStats = (NodeStat *) malloc(sizeof(NodeStat) *
freqCSset->numCSadded);
+ if (!nodeStats) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+ nodeStats[i].origWeight = freqCSset->items[i].support;
+ nodeStats[i].weight = 0;
+ nodeStats[i].steps = -1; // not used
+ nodeStats[i].predecessor = 0; // not used
+ }
+
+ return nodeStats;
+}
+
+static
+void bfs1(int root, CSset* freqCSset, char** adjacencyMatrix, int* queue, int*
visited, int* isInQueue, int* queuePosition, int* queueLength, NodeStat*
nodeStats) {
+ int i;
+
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ if (adjacencyMatrix[root][i]) {
+ if (nodeStats[i].steps == -1) {
+ // no previous path to this node
+ nodeStats[i].weight = nodeStats[root].weight +
nodeStats[i].origWeight;
+ nodeStats[i].steps = nodeStats[root].steps + 1;
+ nodeStats[i].predecessor = root;
+ } else {
+ // previous path to this node
+ // test if values should be updated
+
+ // cycle detection
+ int cycle = 0;
+ int pathId = root;
+ while (pathId != -1) {
+ if (pathId == i) {
+ // cycle found
+ cycle = 1;
+ break;
+ }
+ pathId = nodeStats[pathId].predecessor;
+ }
+ if (cycle) continue;
+
+ if (nodeStats[i].predecessor == root) {
+ // path to 'i' used 'root', has to be
updated if the weight changes
+ if (((float) (nodeStats[root].weight +
nodeStats[i].origWeight)) / (nodeStats[root].steps + 1) != ((float)
nodeStats[i].weight) / nodeStats[i].steps) {
+ // set new weight and path
+ nodeStats[i].weight =
nodeStats[root].weight + nodeStats[i].origWeight;
+ nodeStats[i].steps =
nodeStats[root].steps + 1;
+ nodeStats[i].predecessor = root;
+ // update values for subsequent
nodes
+ visited[i] = 0;
+ }
+ } else if (((float) (nodeStats[root].weight +
nodeStats[i].origWeight)) / (nodeStats[root].steps + 1) > ((float)
nodeStats[i].weight) / nodeStats[i].steps) {
+ // improved weight when accessing node
'i' via 'root'
+ // set new weight and path
+ nodeStats[i].weight =
nodeStats[root].weight + nodeStats[i].origWeight;
+ nodeStats[i].steps =
nodeStats[root].steps + 1;
+ nodeStats[i].predecessor = root;
+ // update values for subsequent nodes
+ visited[i] = 0;
+ }
+ }
+
+ if (!visited[i] && !isInQueue[i]) {
+ // add to queue
+ queue[((*queueLength + *queuePosition) %
freqCSset->numCSadded)] = i;
+ *queueLength += 1;
+ isInQueue[i] = 1;
+ }
+
+ }
+ }
+
+ if (*queueLength > 0) {
+ visited[queue[(*queuePosition % freqCSset->numCSadded)]] = 1;
+ isInQueue[queue[(*queuePosition % freqCSset->numCSadded)]] = 0;
+ *queuePosition += 1;
+ *queueLength -= 1;
+ bfs1(queue[((*queuePosition + freqCSset->numCSadded - 1) %
freqCSset->numCSadded)], freqCSset, adjacencyMatrix, queue, visited, isInQueue,
queuePosition, queueLength, nodeStats);
+ }
+}
+
+static
+void addNode1(char** adjacencyMatrix, NodeStat* nodeStats, CSset* freqCSset,
int root, char initial) {
+ int queue[freqCSset->numCSadded]; // cyclic array
+ int visited[freqCSset->numCSadded];
+ int isInQueue[freqCSset->numCSadded];
+ int queuePosition; // next element in queue to view at
+ int queueLength;
+ int pathId, pathIdTmp;
+ int i;
+
+ // init
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ queue[i] = -1;
+ visited[i] = 0;
+ isInQueue[i] = 0;
+ }
+ visited[root] = 1;
+ queuePosition = 0;
+ queueLength = 0;
+
+ if (initial) {
+ // mark root as a "chosen node"
+ nodeStats[root].steps = 0;
+ nodeStats[root].predecessor = -1;
+ nodeStats[root].weight = 0;
+ } else {
+ // add nodes on path to queue
+ int steps = nodeStats[root].steps;
+ int i = 0;
+
+ pathId = root;
+ while (pathId != -1) {
+ ++i;
+ pathIdTmp = nodeStats[pathId].predecessor; // save
predecessor
+
+ // mark node as a "chosen node"
+ nodeStats[pathId].steps = 0;
+ nodeStats[pathId].predecessor = -1;
+ nodeStats[pathId].weight = 0;
+
+ if (nodeStats[pathIdTmp].steps == 0) break; // found
the end of the path of new nodes
+ queue[queueLength] = pathIdTmp;
+ queueLength += 1;
+ isInQueue[pathIdTmp] = 1;
+
+ pathId = pathIdTmp; // move to predecessor
+ }
+ assert(steps == i);
+ }
+
+ bfs1(root, freqCSset, adjacencyMatrix, queue, visited, isInQueue,
&queuePosition, &queueLength, nodeStats);
+}
+
+int* retrieval1(int root, int numNodesMax, int* numNodesActual, CSset*
freqCSset, CSmergeRel* csRelBetweenMergeFreqSet) {
+ char **adjacencyMatrix = NULL;
+ NodeStat *nodeStats = NULL;
+ int numNodes;
+ int *chosenNodes = NULL;
+ int i, j;
+ int sumSubjects = 0;
+ int csCount = 0;
+ int sumChosenSubjects = 0;
+
+ if (numNodesMax < 1) fprintf(stderr, "ERROR: numNodesMax < 1!\n");
+
+ chosenNodes = (int *) malloc(sizeof(int) * numNodesMax);
+ if (!chosenNodes) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+ adjacencyMatrix = initAdjacencyMatrix(freqCSset->numCSadded);
+ createAdjacencyMatrix(adjacencyMatrix, freqCSset,
csRelBetweenMergeFreqSet);
+ nodeStats = initNodeStats(freqCSset);
+ numNodes = 1;
+
+ // add root node
+ addNode1(adjacencyMatrix, nodeStats, freqCSset, root, 1);
+
+ // add nodes
+ while (numNodes < numNodesMax) {
+ // get top node (highest fraction (weight/steps))
+ int top = -1;
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ int topWeight, testWeight;
+ if (freqCSset->items[i].parentFreqIdx != -1) continue;
// ignore
+ if (nodeStats[i].steps == -1) continue; // non-reachable
+ if (nodeStats[i].steps == 0) continue; // already chosen
+ if (numNodes + nodeStats[i].steps > numNodesMax)
continue; // path too long
+ if (top == -1) {
+ top = i; // set first value
+ continue;
+ }
+ topWeight = ((float) nodeStats[top].weight) /
nodeStats[top].steps;
+ testWeight = ((float) nodeStats[i].weight) /
nodeStats[i].steps;
+ if (testWeight > topWeight) {
+ top = i;
+ }
+ }
+ if (top == -1) break; // not enough nodes found
+ numNodes += nodeStats[top].steps;
+ addNode1(adjacencyMatrix, nodeStats, freqCSset, top, 0); // add
node(s)
+ }
+
+ // store list of chosen nodes
+ printf("SUBSCHEMA:\n");
+ j = 0; // counter for chosenNodes[]
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list