Changeset: 8c25b051ed3a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8c25b051ed3a
Modified Files:
monetdb5/extras/rdf/rdf.h
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
First draft of createFinalLabels, including new UML diagram generation
diffs (truncated from 973 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -103,4 +103,13 @@ typedef enum {
#define N_GRAPH_BAT (MAP_LEX+1)
+// Final data structure that stores the labels for tables and attributes
+typedef struct CSlabel {
+ str name; // table name
+ str *hierarchy; // hierarchy "bottom to top"
+ int hierarchyCount; // number of entries in the hierarchy list
+ int numProp; // number of properties, copied from freqCSset->items[x].numProp
+ char **lstProp; // attribute names (same order as in freqCSset->items[x].lstProp)
+} CSlabel;
+
#endif /* _RDF_H_ */
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -300,6 +300,73 @@ Relation*** initRelationMetadata(int** r
return relationMetadata;
}
+/* Calculate frequency per foreign key relationship. */
+static
+Relation*** initRelationMetadata2(int** relationMetadataCount, CSmergeRel* csRelBetweenMergeFreqSet, CSset* freqCSset) {
+ int i, j, k;
+ Relation*** relationMetadata;
+
+ int ret;
+ char* schema = "rdf";
+
+ TKNZRopen (NULL, &schema);
+
+ relationMetadata = (Relation ***) malloc(sizeof(Relation **) * freqCSset->numCSadded);
+ if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+ for (i = 0; i < freqCSset->numCSadded; ++i) { // CS
+ CS cs;
+ if (i == -1) continue; // ignore
+ cs = (CS) freqCSset->items[i];
+ relationMetadata[i] = (Relation **) malloc (sizeof(Relation *) * cs.numProp);
+ if (!relationMetadata[i]) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+ for (j = 0; j < cs.numProp; ++j) { // propNo in CS order
+ int sum = 0;
+ relationMetadataCount[i][j] = 0;
+ relationMetadata[i][j] = NULL;
+ for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel
+
+ if (csRelBetweenMergeFreqSet[i].lstPropId[k] == cs.lstProp[j]) {
+ int toId = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
+ if (toId == -1) continue; // ignore
+ relationMetadataCount[i][j] += 1;
+
+ // alloc/realloc
+ if (relationMetadataCount[i][j] == 1) {
+ // alloc
+ relationMetadata[i][j] = (Relation *) malloc (sizeof(Relation));
+ if (!relationMetadata[i][j]) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+ relationMetadata[i][j][0].to = toId;
+ relationMetadata[i][j][0].from = i;
+ relationMetadata[i][j][0].freq = csRelBetweenMergeFreqSet[i].lstCnt[k];
+ relationMetadata[i][j][0].percent = -1;
+ } else {
+ // realloc
+ relationMetadata[i][j] = (Relation *) realloc(relationMetadata[i][j], sizeof(Relation) * relationMetadataCount[i][j]);
+ if (!relationMetadata[i][j]) fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+ relationMetadata[i][j][relationMetadataCount[i][j] - 1].to = toId;
+ relationMetadata[i][j][relationMetadataCount[i][j] - 1].from = i;
+ relationMetadata[i][j][relationMetadataCount[i][j] - 1].freq = csRelBetweenMergeFreqSet[i].lstCnt[k];
+ relationMetadata[i][j][relationMetadataCount[i][j] - 1].percent = -1;
+ }
+ }
+ }
+
+ // get total count of values
+ for (k = 0; k < relationMetadataCount[i][j]; ++k) {
+ sum += relationMetadata[i][j][k].freq;
+ }
+ // assign percentage values for every value
+ for (k = 0; k < relationMetadataCount[i][j]; ++k) {
+ relationMetadata[i][j][k].percent = (int) (100.0 * relationMetadata[i][j][k].freq / sum + 0.5);
+ }
+ }
+ }
+
+ TKNZRclose(&ret);
+
+ return relationMetadata;
+}
+
static
IncidentFKs* initLinks(int csCount) {
int i;
@@ -381,7 +448,7 @@ void escapeURIforSQL(char* s) {
/* Create SQL CREATE TABLE statements including foreign keys. */
static
-void convertToSQL(CSset *freqCSset, Relation*** relationMetadata, int** relationMetadataCount, Labels* labels, int freqThreshold) {
+void convertToSQL(CSset *freqCSset, Relation*** relationMetadata, int** relationMetadataCount, CSlabel* labels, int freqThreshold) {
// tokenizer
int ret;
char* schema = "rdf";
@@ -408,6 +475,9 @@ void convertToSQL(CSset *freqCSset, Rela
// create statement for every table
for (i = 0; i < freqCSset->numCSadded; ++i) {
char *temp;
+
+ if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
temp = (char *) malloc(sizeof(char) * (strlen(labels[i].name) + 1));
if (!temp) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
strcpy(temp, labels[i].name);
@@ -434,6 +504,8 @@ void convertToSQL(CSset *freqCSset, Rela
// add foreign key columns and add foreign keys
for (i = 0; i < freqCSset->numCSadded; ++i) {
+ if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
for (j = 0; j < labels[i].numProp; ++j) {
char *temp2;
int refCounter = 0;
@@ -472,7 +544,7 @@ void convertToSQL(CSset *freqCSset, Rela
}
static
-void createSQLMetadata(CSset* freqCSset, CSrel* csrelSet, int num, Labels* labels, int* csIdFreqIdxMap) {
+void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet, CSlabel* labels) {
int **matrix = NULL; // matrix[from][to] frequency
int i, j, k;
FILE *fout;
@@ -491,28 +563,27 @@ void createSQLMetadata(CSset* freqCSset,
}
// set values
- for (i = 0; i < num; ++i) {
- CS cs;
- int csId = csIdFreqIdxMap[i];
- if (csId == -1) continue; // ignore
- cs = (CS) freqCSset->items[csId];
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ CS cs = (CS) freqCSset->items[i];
+
+ if (cs.parentFreqIdx != -1) continue; // ignore
for (j = 0; j < cs.numProp; ++j) { // propNo in CS order
// check foreign key frequency
int sum = 0;
- for (k = 0; k < csrelSet[i].numRef; ++k) {
- if (csrelSet[i].lstPropId[k] == cs.lstProp[j]) {
- sum += csrelSet[i].lstCnt[k];
+ for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) {
+ if (csRelBetweenMergeFreqSet[i].lstPropId[k] == cs.lstProp[j]) {
+ sum += csRelBetweenMergeFreqSet[i].lstCnt[k];
}
}
- for (k = 0; k < csrelSet[i].numRef; ++k) { // propNo in CSrel
- if (csrelSet[i].lstPropId[k] == cs.lstProp[j]) {
- int toId = csIdFreqIdxMap[ csrelSet[i].lstRefCSoid[k] ];
+ for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; ++k) { // propNo in CSrel
+ if (csRelBetweenMergeFreqSet[i].lstPropId[k] == cs.lstProp[j]) {
+ int toId = csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
if (toId == -1) continue; // ignore
if (i == toId) continue; // ignore self references
- if ((int) (100.0 * csrelSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough
- matrix[csId][toId] += csrelSet[i].lstCnt[k]; // multiple links from 'i' to 'toId'? add the frequencies
+ if ((int) (100.0 * csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough
+ matrix[i][toId] += csRelBetweenMergeFreqSet[i].lstCnt[k]; // multiple links from 'i' to 'toId'? add the frequencies
}
}
}
@@ -533,6 +604,9 @@ void createSQLMetadata(CSset* freqCSset,
fout = fopen("tableIdFreq.csv", "wt");
for (i = 0; i < freqCSset->numCSadded; ++i) {
char *temp;
+
+ if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
temp = (char *) malloc(sizeof(char) * (strlen(labels[i].name) + 1));
if (!temp) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
strcpy(temp, labels[i].name);
@@ -552,7 +626,7 @@ void createSQLMetadata(CSset* freqCSset,
/* Simple representation of the final labels for tables and attributes. */
static
-void printTxt(CSset* freqCSset, Labels* labels, int freqThreshold) {
+void printTxt(CSset* freqCSset, CSlabel* labels, int freqThreshold) {
FILE *fout;
char filename[20], tmp[10];
int i, j;
@@ -564,6 +638,8 @@ void printTxt(CSset* freqCSset, Labels*
fout = fopen(filename, "wt");
for (i = 0; i < freqCSset->numCSadded; ++i) {
+ if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
fprintf(fout, "%s (CS "BUNFMT"): ", labels[i].name, freqCSset->items[i].csId);
for (j = 0; j < labels[i].numProp; ++j) {
if (j + 1 < labels[i].numProp) fprintf(fout, "%s, ", labels[i].lstProp[j]);
@@ -1138,8 +1214,9 @@ void createOntologyLookupResult(str** re
#endif
/* Print the dot code to draw an UML-like diagram. Call: dot -Tpdf -O <filename> to create <filename>.pdf */
+/*
static
-void printUML(CSset *freqCSset, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, str** result, int* resultCount, IncidentFKs* links, Labels* labels, Relation*** relationMetadata, int** relationMetadataCount, int freqThreshold) {
+void printUML(CSset *freqCSset, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, str** result, int* resultCount, IncidentFKs* links, CSlabel* labels, Relation*** relationMetadata, int** relationMetadataCount, int freqThreshold) {
str propStr, tmpStr;
int ret;
char* schema = "rdf";
@@ -1179,7 +1256,7 @@ void printUML(CSset *freqCSset, int type
CS cs = (CS) freqCSset->items[i];
#if SHOW_CANDIDATES
- /* DATA SOURCES */
+ // DATA SOURCES
resultStr = (char *) malloc(sizeof(char) * resultStrSize);
if (!resultStr) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
strcpy(resultStr, "\0");
@@ -1439,11 +1516,204 @@ void printUML(CSset *freqCSset, int type
TKNZRclose(&ret);
}
+*/
+
+static
+void printUML2(CSset *freqCSset, CSlabel* labels, Relation*** relationMetadata, int** relationMetadataCount, int freqThreshold) {
+ str propStr, tmpStr;
+ int ret;
+ char* schema = "rdf";
+
+ int i, j, k;
+ FILE *fout;
+ char filename[20], tmp[10];
+
+ int smallest = -1, biggest = -1;
+
+ if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+ fprintf(stderr, "could not open the tokenizer\n");
+ }
+
+ strcpy(filename, "CS2max");
+ sprintf(tmp, "%d", freqThreshold);
+ strcat(filename, tmp);
+ strcat(filename, ".dot");
+
+ fout = fopen(filename, "wt");
+
+ // header
+ fprintf(fout, "digraph g {\n");
+ fprintf(fout, "graph[ratio=\"compress\"];\n");
+ fprintf(fout, "node [shape=\"none\"];\n\n");
+
+ // find biggest and smallest table
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ CS cs = (CS) freqCSset->items[i];
+ if (cs.parentFreqIdx != -1) continue; // ignore
+
+ // first values
+ if (smallest == -1) smallest = i;
+ if (biggest == -1) biggest = i;
+
+ if (cs.coverage < freqCSset->items[smallest].coverage) smallest = i;
+ if (cs.coverage > freqCSset->items[biggest].coverage) biggest = i;
+ }
+
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ int width;
+ CS cs = (CS) freqCSset->items[i];
+ if (cs.parentFreqIdx != -1) continue; // ignore
+
+ // print header
+ width = (int) ((300 + 300 * (log10(freqCSset->items[i].coverage) - log10(freqCSset->items[smallest].coverage)) / (log10(freqCSset->items[biggest].coverage) - log10(freqCSset->items[smallest].coverage))) + 0.5); // width between 300 and 600 px, using logarithm
+ fprintf(fout, "\"" BUNFMT "\" [\n", cs.csId);
+ fprintf(fout, "label = <<TABLE BORDER=\"0\" CELLBORDER=\"1\" CELLSPACING=\"0\">\n");
+ fprintf(fout, "<TR><TD WIDTH=\"%d\"><B>%s (#triples: %d)</B></TD></TR>\n", width, labels[i].name, cs.coverage);
+
+ for (j = 0; j < cs.numProp; ++j) {
+ char *propStrEscaped = NULL;
+ char *propStrShort = NULL;
+ str color;
+
+ takeOid(cs.lstProp[j], &tmpStr);
+
+ // copy propStr to propStrEscaped because .dot-PORTs cannot contain colons and quotes
+ propStr = removeBrackets(tmpStr);
+ propStrEscaped = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
+ if (!propStrEscaped) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+ memcpy(propStrEscaped, propStr, (strlen(propStr) + 1));
+ escapeURI(propStrEscaped);
+ getPropNameShort(&propStrShort, propStr);
+
+ // assign color (the more tuples the property occurs in, the darker
+ if ((1.0 * cs.lstPropSupport[j])/cs.support > 0.8) {
+ color = "#5555FF";
+ } else if ((1.0 * cs.lstPropSupport[j])/cs.support > 0.6) {
+ color = "#7777FF";
+ } else if ((1.0 * cs.lstPropSupport[j])/cs.support > 0.4) {
+ color = "#9999FF";
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list