Changeset: 9a9a115446e0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9a9a115446e0
Modified Files:
monetdb5/extras/rdf/rdfretrieval.c
monetdb5/extras/rdf/rdfretrieval.h
Branch: rdf
Log Message:
Schema overview: first version of algorithm to choose tables that provide an
overview of the SQL schema
diffs (289 lines):
diff --git a/monetdb5/extras/rdf/rdfretrieval.c
b/monetdb5/extras/rdf/rdfretrieval.c
--- a/monetdb5/extras/rdf/rdfretrieval.c
+++ b/monetdb5/extras/rdf/rdfretrieval.c
@@ -553,8 +553,257 @@ int* retrieval4(int root, int numNodesMa
return chosenNodes;
}
+static
+char** initEdgesOverview(long int* table_id, int tableCount, long int*
adjacency_from, long int* adjacency_to, int adjacencyCount) {
+ char **edges;
+ int i, j;
+
+ edges = (char **) malloc(sizeof(char *) * tableCount);
+ if (!edges) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+ for (i = 0; i < tableCount; ++i) {
+ edges[i] = (char *) malloc(sizeof(char) * tableCount);
+ if (!edges[i]) fprintf(stderr, "ERROR: Couldn't malloc
memory!\n");
+ for (j = 0; j < tableCount; ++j) {
+ edges[i][j] = 0;
+ }
+ edges[i][i] = 1; // self-reachability
+ }
+
+ for (i = 0; i < adjacencyCount; ++i) {
+ long int from = adjacency_from[i];
+ long int to = adjacency_to[i];
+ int fromIdx = -1;
+ int toIdx = -1;
+
+ // index lookup
+ for (j = 0; j < tableCount; ++j) {
+ if (table_id[j] == from) {fromIdx = j;}
+ if (table_id[j] == to) {toIdx = j;}
+ if (fromIdx > -1 && toIdx > -1) {break;}
+ }
+ assert(fromIdx > -1);
+ assert(toIdx > -1);
+
+ // set edge
+ edges[fromIdx][toIdx] = 1;
+ }
+
+ return edges;
+}
+
+static
+int compareOverviewNodes (const void * a, const void * b) {
+ return ( (*(Node*)b).reachabilityCount - (*(Node*)a).reachabilityCount ); //
sort descending
+}
+
+static
+int* retrievalOverview(int* numNodesActual, long int* table_id, str*
table_name, long int* table_freq, int tableCount, long int* adjacency_from,
long int* adjacency_to, int adjacencyCount) {
+ int i, j, k;
+ char **edges;
+ int sumSubjects = 0;
+ int csCount = 0;
+ int sumChosenSubjects = 0;
+
+ int queue[tableCount]; // cyclic array
+ int isInQueue[tableCount];
+ int queuePosition; // next element in queue to view at
+ int queueLength;
+ char visited[tableCount];
+ int subgraphSize;
+ Groups groups;
+ int *chosenNodes = NULL;
+
+ groups.count = 0;
+ groups.groups = NULL;
+
+ edges = initEdgesOverview(table_id, tableCount, adjacency_from,
adjacency_to, adjacencyCount);
+
+ for (i = 0; i < tableCount; ++i) {
+ visited[i] = 0;
+ }
+
+ // split into disconnected subgraph (ignoring the direction of the
edges) using BFS
+ while (1) {
+ int root = -1;
+ for (i = 0; i < tableCount; ++i) {
+ if (!visited[i]) {
+ root = i;
+ break;
+ }
+ }
+ if (root == -1) break; // all nodes have been visited, all
subgraphs have been found
+ // init
+ subgraphSize = 0;
+
+ for (i = 0; i < tableCount; ++i) {
+ queue[i] = -1;
+ isInQueue[i] = 0;
+ }
+
+ // add root node
+ queue[0] = root;
+ queuePosition = 0;
+ queueLength = 1;
+
+ visited[root] = 1;
+ isInQueue[root] = 1;
+
+ // bfs
+ while (queueLength > 0) {
+ // dequeue next value
+ int node = queue[queuePosition % tableCount];
+ visited[node] = 1;
+ subgraphSize++;
+ isInQueue[node] = 0;
+ queuePosition += 1;
+ queueLength -= 1;
+
+ // for all adjacent edges
+ for (i = 0; i < tableCount; ++i) {
+ if (visited[i] || isInQueue[i]) continue;
+ if (edges[node][i] || edges[i][node]) {
+ // ignore direction of edge
+
+ // enqueue
+ queue[((queueLength + queuePosition) %
tableCount)] = i;
+ queueLength += 1;
+ isInQueue[i] = 1;
+ }
+ }
+ }
+
+ // store subgraph/group
+ j = 0;
+ // add group
+ groups.count++;
+ groups.groups = realloc(groups.groups, sizeof(Group) *
groups.count);
+ if (!groups.groups) fprintf(stderr, "ERROR: Couldn't realloc
memory!\n");
+ groups.groups[groups.count - 1].size = subgraphSize;
+ groups.groups[groups.count - 1].nodes = (Node *)
malloc(sizeof(Node) * subgraphSize);
+
+ for (i = 0; i < tableCount; ++i) {
+ if (visited[i] == 1) {
+ // add to group
+ groups.groups[groups.count - 1].nodes[j].idx =
i;
+ j++;
+ visited[i] = 2; // node is still marked, but
can be distinguished from nodes visited in other iterations
+ }
+ }
+ assert(j == subgraphSize);
+ }
+
+ // transitive closure (Floyd-Warshall-Algorithm)
+ for (k = 0; k < tableCount; ++k) {
+ for (i = 0; i < tableCount; ++i) {
+ for (j = 0; j < tableCount; ++j) {
+ if (i == j || i == k || j == k) continue;
+ if (edges[i][k] && edges[k][j]) edges[i][j] = 1;
+ }
+ }
+ }
+
+ // select nodes to be shown in the overview schema
+ for (i = 0; i < groups.count; ++i) {
+ int found = 0;
+
+ // count how many group members can be reached from each node
+ for (j = 0; j < groups.groups[i].size; ++j) {
+ int node = groups.groups[i].nodes[j].idx;
+ int reachabilityCount = 0;
+ for (k = 0; k < tableCount; ++k) {
+ if (edges[node][k]) {
+ reachabilityCount++;
+ }
+ }
+ groups.groups[i].nodes[j].reachabilityCount =
reachabilityCount;
+
+ if (reachabilityCount == groups.groups[i].size) {
+ // found a node that links to all other nodes
in that group --> add to overview schema
+ (*numNodesActual) += 1;
+ chosenNodes = realloc(chosenNodes, sizeof(int)
* (*numNodesActual));
+ if (!chosenNodes) fprintf(stderr, "ERROR:
Couldn't realloc memory!\n");
+ chosenNodes[*numNodesActual - 1] = node;
+ found = 1;
+ break;
+ }
+ }
+ if (!found) {
+ int node;
+ char reachability[tableCount];
+ int reachabilityCount = 0;
+ int nextNode; // position in the (sorted) list of nodes
to look at next
+
+ // greedy
+ qsort(groups.groups[i].nodes, groups.groups[i].size,
sizeof(Node), compareOverviewNodes);
+
+ // take first node (covers the most nodes)
+ node = groups.groups[i].nodes[0].idx;
+ (*numNodesActual) += 1;
+ chosenNodes = realloc(chosenNodes, sizeof(int) *
(*numNodesActual));
+ if (!chosenNodes) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
+ chosenNodes[*numNodesActual - 1] = node;
+ nextNode = 1;
+ // store reachability vector
+ for (j = 0; j < tableCount; ++j) {
+ if (edges[node][j]) {
+ reachability[j] = 1;
+ reachabilityCount++;
+ } else {
+ reachability[j] = 0;
+ }
+ }
+ assert (groups.groups[i].nodes[0].reachabilityCount ==
reachabilityCount);
+
+ // take more nodes
+ for (j = nextNode; j < tableCount; ++j) {
+ int node = groups.groups[i].nodes[j].idx;;
+ if (reachabilityCount == groups.groups[i].size)
break;
+ if (reachability[node]) continue;
+
+ // take this node
+ (*numNodesActual) += 1;
+ chosenNodes = realloc(chosenNodes, sizeof(int)
* (*numNodesActual));
+ if (!chosenNodes) fprintf(stderr, "ERROR:
Couldn't realloc memory!\n");
+ chosenNodes[*numNodesActual - 1] = node;
+ nextNode = (j + 1);
+
+ // update reachability vector
+ for (k = 0; k < tableCount; ++k) {
+ if (edges[node][k] && !reachability[k])
{
+ reachability[k] = 1;
+ reachabilityCount++;
+ }
+ }
+ }
+ }
+ }
+
+ printf("SUBSCHEMA:\n");
+ for (i = 0; i < *numNodesActual; ++i) {
+ str name = table_name[chosenNodes[i]];
+ printf("CS %s\n", name);
+ }
+
+ // statistics
+ for (i = 0; i < tableCount; ++i) {
+ csCount += 1;
+ sumSubjects += table_freq[i];
+ for (j = 0; j < *numNodesActual; ++j) {
+ if (chosenNodes[j] == i) sumChosenSubjects +=
table_freq[i];
+ }
+ }
+ printf("COVERAGE:\n");
+ printf("%d out of %d (%f %%) using %d out of %d tables (%f %%)\n",
sumChosenSubjects, sumSubjects, (100.00 * sumChosenSubjects) / sumSubjects,
*numNodesActual, csCount, (100.00 * *numNodesActual) / csCount);
+ return chosenNodes;
+}
+
int* retrieval(int root, int numNodesMax, int* numNodesActual, long int*
table_id, str* table_name, long int* table_freq, int tableCount, long int*
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int
adjacencyCount) {
- if (SUBSCHEMA_HEURISTIC == 4) {
+ if (SUBSCHEMA_HEURISTIC == 5) {
+ (void) numNodesMax;
+ (void) root;
+ return retrievalOverview(numNodesActual, table_id, table_name,
table_freq, tableCount, adjacency_from, adjacency_to, adjacencyCount);
+ } else if (SUBSCHEMA_HEURISTIC == 4) {
return retrieval4(root, numNodesMax, numNodesActual, table_id,
table_name, table_freq, tableCount, adjacency_from, adjacency_to,
adjacency_freq, adjacencyCount);
} else if (SUBSCHEMA_HEURISTIC == 3) {
(void) adjacency_freq;
diff --git a/monetdb5/extras/rdf/rdfretrieval.h
b/monetdb5/extras/rdf/rdfretrieval.h
--- a/monetdb5/extras/rdf/rdfretrieval.h
+++ b/monetdb5/extras/rdf/rdfretrieval.h
@@ -29,7 +29,22 @@ typedef struct NodeStat {
int origWeight; // = CS frequency
} NodeStat;
-#define SUBSCHEMA_HEURISTIC 4
+typedef struct Node {
+ int idx;
+ int reachabilityCount; // how many members of the group can be
reached from this node? (self-reachability is included, max value is 'size')
+} Node;
+
+typedef struct Group {
+ int size;
+ Node* nodes;
+} Group;
+
+typedef struct Groups {
+ int count;
+ Group *groups;
+} Groups;
+
+#define SUBSCHEMA_HEURISTIC 5
int* retrieval(int root, int numNodesMax, int* numNodesActual, long int*
table_id, str* table_name, long int* table_freq, int tableCount, long int*
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int
adjacencyCount);
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list