Changeset: 182a90c795fc for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=182a90c795fc
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdfretrieval.c
monetdb5/extras/rdf/rdfretrieval.h
sql/backends/monet5/sql.mx
Branch: rdf
Log Message:
Heuristic to maximize the edge-weights
Maximize the number of relationships between tuples in the selected subschema
diffs (273 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -414,16 +414,16 @@ void convertToSQL(CSset *freqCSset, Rela
static
void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet,
Labels* labels) {
- char **matrix = NULL; // matrix[from][to]
+ int **matrix = NULL; // matrix[from][to] frequency
int i, j, k;
FILE *fout;
// init
- matrix = (char **) malloc(sizeof(char *) * freqCSset->numCSadded);
+ matrix = (int **) malloc(sizeof(int *) * freqCSset->numCSadded);
if (!matrix) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
for (i = 0; i < freqCSset->numCSadded; ++i) {
- matrix[i] = (char *) malloc(sizeof(char *) *
freqCSset->numCSadded);
+ matrix[i] = (int *) malloc(sizeof(char *) *
freqCSset->numCSadded);
if (!matrix) fprintf(stderr, "ERROR: Couldn't realloc
memory!\n");
for (j = 0; j < freqCSset->numCSadded; ++j) {
@@ -449,7 +449,7 @@ void createSQLMetadata(CSset* freqCSset,
int to =
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
if (i == to) continue; // ignore self
references
if ((int) (100.0 *
csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD)
continue; // foreign key is not frequent enough
- matrix[i][to] = 1;
+ matrix[i][to] +=
csRelBetweenMergeFreqSet[i].lstCnt[k]; // multiple links from 'i' to 'to'? add
the frequencies
}
}
}
@@ -460,7 +460,7 @@ void createSQLMetadata(CSset* freqCSset,
for (i = 0; i < freqCSset->numCSadded; ++i) {
for (j = 0; j < freqCSset->numCSadded; ++j) {
if (matrix[i][j]) {
- fprintf(fout, "\"%d\",\"%d\"\n",i,j);
+ fprintf(fout, "\"%d\",\"%d\",\"%d\"\n", i, j,
matrix[i][j]);
}
}
}
@@ -480,7 +480,7 @@ void createSQLMetadata(CSset* freqCSset,
fout = fopen("CSmetadata.sql", "wt");
fprintf(fout, "CREATE TABLE table_id_freq (id VARCHAR(10), name
VARCHAR(100), frequency VARCHAR(10));\n");
- fprintf(fout, "CREATE TABLE adjacency_list (from_id VARCHAR(10), to_id
VARCHAR(10));\n");
+ fprintf(fout, "CREATE TABLE adjacency_list (from_id VARCHAR(10), to_id
VARCHAR(10), frequency VARCHAR(10));\n");
fprintf(fout, "COPY INTO table_id_freq from
'/export/scratch2/linnea/dbfarm/test/tableIdFreq.csv' USING DELIMITERS
',','\\n','\"';\n");
fprintf(fout, "COPY INTO adjacency_list from
'/export/scratch2/linnea/dbfarm/test/adjacencyList.csv' USING DELIMITERS
',','\\n','\"';");
fclose(fout);
diff --git a/monetdb5/extras/rdf/rdfretrieval.c
b/monetdb5/extras/rdf/rdfretrieval.c
--- a/monetdb5/extras/rdf/rdfretrieval.c
+++ b/monetdb5/extras/rdf/rdfretrieval.c
@@ -33,6 +33,15 @@ int edgeExists(long int from, long int t
}
static
+int getTableIndex(long int id, long int* table_id, int tableCount) {
+ int i;
+ for (i = 0; i < tableCount; ++i) {
+ if (table_id[i] == id) return i;
+ }
+ return -1;
+}
+
+static
NodeStat* initNodeStats1(long int* table_freq, int tableCount) {
NodeStat* nodeStats = NULL;
int i;
@@ -466,12 +475,95 @@ int* retrieval3(int root, int numNodesMa
return chosenNodes;
}
-int* retrieval(int root, int numNodesMax, int* numNodesActual, long int*
table_id, str* table_name, long int* table_freq, int tableCount, long int*
adjacency_from, long int* adjacency_to, int adjacencyCount) {
- if (SUBSCHEMA_HEURISTIC == 3) {
+static
+int* retrieval4(int root, int numNodesMax, int* numNodesActual, long int*
table_id, str* table_name, long int* table_freq, int tableCount, long int*
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int
adjacencyCount) {
+ int numNodes;
+ int *chosenNodes = NULL;
+ int i, j;
+ int sumSubjects = 0;
+ int csCount = 0;
+ int sumChosenSubjects = 0;
+
+ if (numNodesMax < 1) fprintf(stderr, "ERROR: numNodesMax < 1!\n");
+
+ chosenNodes = (int *) malloc(sizeof(int) * numNodesMax);
+ if (!chosenNodes) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+ numNodes = 0;
+
+ // add root node
+ chosenNodes[numNodes] = root;
+ numNodes += 1;
+
+ // add nodes
+ while (numNodes < numNodesMax) {
+ int bestNextEdge = -1;
+ for (i = 0; i < adjacencyCount; ++i) {
+ char foundFrom = 0;
+ char foundTo = 0;
+ for (j = 0; j < numNodes; ++j) {
+ if (chosenNodes[j] ==
getTableIndex(adjacency_to[i], table_id, tableCount)) {
+ foundTo = 1;
+ break;
+ }
+ }
+ for (j = 0; j < numNodes; ++j) {
+ if (chosenNodes[j] ==
getTableIndex(adjacency_from[i], table_id, tableCount)) {
+ foundFrom = 1;
+ break;
+ }
+ }
+ if (foundFrom && !foundTo) {
+ // set or update
+ if (bestNextEdge == -1) {
+ // first edge
+ bestNextEdge = i;
+ } else {
+ if (adjacency_freq[i] >
adjacency_freq[bestNextEdge]) bestNextEdge = i;
+ }
+ }
+ }
+ if (bestNextEdge == -1) {
+ // no more edges
+ break;
+ } else {
+ chosenNodes[numNodes] =
getTableIndex(adjacency_to[bestNextEdge], table_id, tableCount);
+ numNodes += 1;
+ }
+ }
+
+ printf("SUBSCHEMA:\n");
+ for (i = 0; i < numNodes; ++i) {
+ str name = table_name[chosenNodes[i]];
+ printf("CS %s\n", name);
+ }
+
+ // statistics
+ for (i = 0; i < tableCount; ++i) {
+ csCount += 1;
+ sumSubjects += table_freq[i];
+ for (j = 0; j < numNodes; ++j) {
+ if (chosenNodes[j] == i) sumChosenSubjects +=
table_freq[i];
+ }
+ }
+ printf("COVERAGE:\n");
+ printf("%d out of %d (%f %%) using %d out of %d tables (%f %%)\n",
sumChosenSubjects, sumSubjects, (100.00 * sumChosenSubjects) / sumSubjects,
numNodes, csCount, (100.00 * numNodes) / csCount);
+
+ *numNodesActual = numNodes;
+ return chosenNodes;
+}
+
+int* retrieval(int root, int numNodesMax, int* numNodesActual, long int*
table_id, str* table_name, long int* table_freq, int tableCount, long int*
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int
adjacencyCount) {
+ if (SUBSCHEMA_HEURISTIC == 4) {
+ return retrieval4(root, numNodesMax, numNodesActual, table_id,
table_name, table_freq, tableCount, adjacency_from, adjacency_to,
adjacency_freq, adjacencyCount);
+ } else if (SUBSCHEMA_HEURISTIC == 3) {
+ (void) adjacency_freq;
return retrieval3(root, numNodesMax, numNodesActual, table_id,
table_name, table_freq, tableCount, adjacency_from, adjacency_to,
adjacencyCount);
} else if (SUBSCHEMA_HEURISTIC == 2) {
+ (void) adjacency_freq;
return retrieval2(root, numNodesMax, numNodesActual, table_id,
table_name, table_freq, tableCount, adjacency_from, adjacency_to,
adjacencyCount);
}
// SUBSCHEMA_HEURISTIC == 1 or other value
+ (void) adjacency_freq;
return retrieval1(root, numNodesMax, numNodesActual, table_id,
table_name, table_freq, tableCount, adjacency_from, adjacency_to,
adjacencyCount);
};
diff --git a/monetdb5/extras/rdf/rdfretrieval.h
b/monetdb5/extras/rdf/rdfretrieval.h
--- a/monetdb5/extras/rdf/rdfretrieval.h
+++ b/monetdb5/extras/rdf/rdfretrieval.h
@@ -29,8 +29,8 @@ typedef struct NodeStat {
int origWeight; // = CS frequency
} NodeStat;
-#define SUBSCHEMA_HEURISTIC 1
+#define SUBSCHEMA_HEURISTIC 4
-int* retrieval(int root, int numNodesMax, int* numNodesActual, long int*
table_id, str* table_name, long int* table_freq, int tableCount, long int*
adjacency_from, long int* adjacency_to, int adjacencyCount);
+int* retrieval(int root, int numNodesMax, int* numNodesActual, long int*
table_id, str* table_name, long int* table_freq, int tableCount, long int*
adjacency_from, long int* adjacency_to, long int* adjacency_freq, int
adjacencyCount);
#endif /* _RDFRETRIEVAL_H_ */
diff --git a/sql/backends/monet5/sql.mx b/sql/backends/monet5/sql.mx
--- a/sql/backends/monet5/sql.mx
+++ b/sql/backends/monet5/sql.mx
@@ -7778,13 +7778,13 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
int i;
BAT *table_id_freq_id, *table_id_freq_name,
*table_id_freq_freq;
- BAT *adjacency_list_from, *adjacency_list_to;
+ BAT *adjacency_list_from, *adjacency_list_to,
*adjacency_list_freq;
BATiter table_id_freq_id_i, table_id_freq_name_i,
table_id_freq_freq_i;
- BATiter adjacency_list_from_i, adjacency_list_to_i;
+ BATiter adjacency_list_from_i, adjacency_list_to_i,
adjacency_list_freq_i;
BUN p, q;
BUN p2, q2;
BUN bun, bun2, bun3;
- BUN bun4, bun5;
+ BUN bun4, bun5, bun6;
int tableCount = 0;
long int *table_id = NULL;
@@ -7792,6 +7792,7 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
long int *table_freq = NULL;
long int *adjacency_from = NULL;
long int *adjacency_to = NULL;
+ long int *adjacency_freq = NULL;
int adjacencyCount = 0;
int root;
@@ -7810,18 +7811,21 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
table_id_freq_freq = mvc_bind(m, "sys", "table_id_freq", "frequency",
0);
adjacency_list_from = mvc_bind(m, "sys", "adjacency_list", "from_id",
0);
adjacency_list_to = mvc_bind(m, "sys", "adjacency_list", "to_id", 0);
+ adjacency_list_freq = mvc_bind(m, "sys", "adjacency_list", "frequency",
0);
table_id_freq_id_i = bat_iterator(table_id_freq_id);
table_id_freq_name_i = bat_iterator(table_id_freq_name);
table_id_freq_freq_i = bat_iterator(table_id_freq_freq);
adjacency_list_from_i = bat_iterator(adjacency_list_from);
adjacency_list_to_i = bat_iterator(adjacency_list_to);
+ adjacency_list_freq_i = bat_iterator(adjacency_list_freq);
bun = BUNfirst(table_id_freq_id);
bun2 = BUNfirst(table_id_freq_name);
bun3 = BUNfirst(table_id_freq_freq);
bun4 = BUNfirst(adjacency_list_from);
bun5 = BUNfirst(adjacency_list_to);
+ bun6 = BUNfirst(adjacency_list_freq);
// store data
BATloop(table_id_freq_id, p, q) {
@@ -7844,13 +7848,16 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
BATloop(adjacency_list_from, p2, q2) {
str from = (str) BUNtail(adjacency_list_from_i, bun4 +
adjacencyCount);
str to = (str) BUNtail(adjacency_list_to_i, bun5 +
adjacencyCount);
+ str freq = (str) BUNtail(adjacency_list_freq_i, bun6 +
adjacencyCount);
adjacency_from = realloc(adjacency_from, sizeof(long int) *
(adjacencyCount + 1));
adjacency_to = realloc(adjacency_to, sizeof(long int) *
(adjacencyCount + 1));
- if (!adjacency_from || !adjacency_to) fprintf(stderr, "ERROR:
Couldn't realloc memory!\n");
+ adjacency_freq = realloc(adjacency_freq, sizeof(long int) *
(adjacencyCount + 1));
+ if (!adjacency_from || !adjacency_to || !adjacency_freq)
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
adjacency_from[adjacencyCount] = strtol(from, NULL, 10);
adjacency_to[adjacencyCount] = strtol(to, NULL, 10);
+ adjacency_freq[adjacencyCount] = strtol(freq, NULL, 10);
adjacencyCount += 1;
}
@@ -7860,7 +7867,7 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
// call retrieval function
countActual = 0;
- nodes = retrieval(root, *count, &countActual, table_id, table_name,
table_freq, tableCount, adjacency_from, adjacency_to, adjacencyCount);
+ nodes = retrieval(root, *count, &countActual, table_id, table_name,
table_freq, tableCount, adjacency_from, adjacency_to, adjacency_freq,
adjacencyCount);
// create schema
name = "s123"; // TODO create schema name
@@ -7893,12 +7900,14 @@ SQLrdfRetrieveSubschema(Client cntxt, Ma
BBPreclaim(table_id_freq_freq);
BBPreclaim(adjacency_list_from);
BBPreclaim(adjacency_list_to);
+ BBPreclaim(adjacency_list_freq);
free(table_id);
free(table_name);
free(table_freq);
free(adjacency_from);
free(adjacency_to);
+ free(adjacency_freq);
return MAL_SUCCEED;
}
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list