Changeset: 20b5c4ef8fcc for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=20b5c4ef8fcc
Added Files:
monetdb5/extras/rdf/rdfretrieval.c
monetdb5/extras/rdf/rdfretrieval.h
Modified Files:
monetdb5/extras/rdf/Makefile.ag
monetdb5/extras/rdf/hashmap/hashmap.c
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/trie/trie.c
monetdb5/extras/rdf/trie/trie.h
sql/backends/monet5/sql.mx
sql/scripts/30_rdf.sql
Branch: rdf
Log Message:
Merge with Linnea's changes for labeling functions
diffs (truncated from 2015 to 300 lines):
diff --git a/monetdb5/extras/rdf/Makefile.ag b/monetdb5/extras/rdf/Makefile.ag
--- a/monetdb5/extras/rdf/Makefile.ag
+++ b/monetdb5/extras/rdf/Makefile.ag
@@ -32,7 +32,7 @@ lib__rdf = {
#MODULE
NOINST
#DIR = libdir/monetdb5
- SOURCES = rdf.h rdfschema.h rdflabels.h rdfparser.h rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c rdflabels.c
+ SOURCES = rdf.h rdfschema.h rdflabels.h rdfretrieval.h rdfparser.h rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c rdflabels.c rdfretrieval.c
#SEP = _
# LIBS = ./hashmap/librdfhash
diff --git a/monetdb5/extras/rdf/hashmap/hashmap.c b/monetdb5/extras/rdf/hashmap/hashmap.c
--- a/monetdb5/extras/rdf/hashmap/hashmap.c
+++ b/monetdb5/extras/rdf/hashmap/hashmap.c
@@ -1,5 +1,6 @@
/* HashMap for the characteristic sets (CSs') in RDF */
+#include "monetdb_config.h"
#include <hashmap.h>
#include <stdlib.h>
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -105,15 +105,17 @@ ontology ontologies[] = {
#if USE_SHORT_NAMES
/* Extracts the "human-readable" part of an URI (usually the last token). */
static
-void getPropNameShort(char* name, char* propStr) {
+void getPropNameShort(char** name, char* propStr) {
char *token;
- char uri[1000];
+ char *uri;
int length = 0; // number of tokens
char **tokenizedUri = NULL; // list of tokens
int i, j;
int fit;
// tokenize uri
+ uri = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
+ if (!uri) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
strcpy(uri, propStr); // uri will be modified during tokenization
token = strtok(uri, "/#");
while (token != NULL) {
@@ -134,12 +136,20 @@ void getPropNameShort(char* name, char*
}
if (fit) {
// found matching ontology, create label
+ int totalLength = 0;
for (i = ontologies[j].length; i < length; ++i)
{
- strcat(name, tokenizedUri[i]);
- strcat(name, "_"); // if label consists of >=2 tokens, use underscores
+ totalLength += (strlen(tokenizedUri[i]) + 1); // additional char for underscore
+ }
+ (*name) = (char *) malloc(sizeof(char) * (totalLength + 1));
+ if (!(*name)) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+ strcpy(*name, "\0");
+
+ for (i = ontologies[j].length; i < length; ++i)
{
+ strcat(*name, tokenizedUri[i]);
+ strcat(*name, "_"); // if label consists of >=2 tokens, use underscores
}
// remove trailing underscore
- name[strlen(name) - 1] = '\0';
+ (*name)[strlen(*name) - 1] = '\0';
free(tokenizedUri);
return;
@@ -151,12 +161,17 @@ void getPropNameShort(char* name, char*
if (length == 1) {
// value
- strcat(name, propStr);
+ (*name) = (char *) malloc(sizeof(char) * (strlen(propStr) + 1));
+ if (!(*name)) fprintf(stderr, "ERROR: Couldn't malloc
memory!\n");
+ strcpy(*name, propStr);
} else {
- strcat(name, tokenizedUri[length - 1]);
+ (*name) = (char *) malloc(sizeof(char) *
(strlen(tokenizedUri[length - 1]) + 1));
+ if (!(*name)) fprintf(stderr, "ERROR: Couldn't malloc
memory!\n");
+ strcpy(*name, tokenizedUri[length - 1]);
}
free(tokenizedUri);
+ free(uri);
return;
}
#endif
@@ -180,8 +195,8 @@ int** initTypeAttributesHistogramCount(i
}
static
-TypeAttributesFreq*** initTypeAttributesHistogram(int typeAttributesCount, int
** typeAttributesHistogramCount, int num) {
- int i, j, k;
+TypeAttributesFreq*** initTypeAttributesHistogram(int typeAttributesCount, int
num) {
+ int i, j;
TypeAttributesFreq*** typeAttributesHistogram;
typeAttributesHistogram = (TypeAttributesFreq ***)
malloc(sizeof(TypeAttributesFreq **) * num);
@@ -190,12 +205,7 @@ TypeAttributesFreq*** initTypeAttributes
typeAttributesHistogram[i] = (TypeAttributesFreq **) malloc
(sizeof(TypeAttributesFreq *) * typeAttributesCount);
if (!typeAttributesHistogram[i]) fprintf(stderr, "ERROR:
Couldn't malloc memory!\n");
for (j = 0; j < typeAttributesCount; ++j) {
- typeAttributesHistogram[i][j] = (TypeAttributesFreq *)
malloc (sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[i][j]);
- if (!typeAttributesHistogram[i][j]) fprintf(stderr,
"ERROR: Couldn't malloc memory!\n");
- for (k = 0; k < typeAttributesHistogramCount[i][j];
++k) {
- typeAttributesHistogram[i][j][k].freq = 0;
- typeAttributesHistogram[i][j][k].percent = 0;
- }
+ typeAttributesHistogram[i][j] = NULL;
}
}
@@ -302,12 +312,27 @@ IncidentFKs* initLinks(int csCount) {
}
/* Modifies the parameter! */
-/* from: <URI>/ to: URI */
+/* from: <URI>/ or <URI> to: URI */
static
void removeBrackets(char** s) {
- if (strlen(*s) < 3) return;
- (*s)[strlen(*s) - 2] = '\0';
- (*s) += 1;
+ if (strlen(*s) < 2) return;
+
+ if ((*s)[0] == '<' && (*s)[strlen(*s) - 2] == '>' && (*s)[strlen(*s) - 1] == '/') {
+ // case <URI>/
+ (*s)[strlen(*s) - 2] = '\0';
+ (*s) += 1;
+ } else if ((*s)[0] == '<' && (*s)[strlen(*s) - 2] == '/' && (*s)[strlen(*s) - 1] == '>') {
+ // case <URI/>
+ (*s)[strlen(*s) - 2] = '\0';
+ (*s) += 1;
+ } else if ((*s)[0] == '<' && (*s)[strlen(*s) - 1] == '>') {
+ // case <URI>
+ (*s)[strlen(*s) - 1] = '\0';
+ (*s) += 1;
+ } else if ((*s)[strlen(*s) - 1] == '/') {
+ // case URI/
+ (*s)[strlen(*s) - 1] = '\0';
+ }
}
/* Modifies the parameter! */
@@ -322,13 +347,14 @@ void escapeURI(char* s) {
}
/* Modifies the parameter! */
-/* Replaces colons, quotes, spaces, and dashes with underscores. */
+/* Replaces colons, quotes, spaces, and dashes with underscores. All lowercase. */
static
void escapeURIforSQL(char* s) {
int i;
for (i = 0; i < (int) strlen(s); ++i) {
if (s[i] == ':' || s[i] == '"' || s[i] == ' ' || s[i] == '-')
s[i] = '_';
+ s[i] = tolower(s[i]);
}
}
@@ -341,7 +367,7 @@ void convertToSQL(CSset *freqCSset, Rela
// file i/o
FILE *fout;
- char filename[100], tmp[10];
+ char filename[20], tmp[10];
// looping
int i, j, k;
@@ -360,13 +386,18 @@ void convertToSQL(CSset *freqCSset, Rela
// create statement for every table
for (i = 0; i < freqCSset->numCSadded; ++i) {
- char temp[100];
+ char *temp;
if ( freqCSset->items[i].parentFreqIdx != -1) continue; //
ignore
+ temp = (char *) malloc(sizeof(char) * (strlen(labels[i].name) +
1));
+ if (!temp) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
strcpy(temp, labels[i].name);
escapeURIforSQL(temp);
- fprintf(fout, "CREATE TABLE %s_"BUNFMT" (\nsubject VARCHAR(10) PRIMARY KEY,\n", temp, freqCSset->items[i].csId); // TODO uppercase? underscores?
+ fprintf(fout, "CREATE TABLE %s_"BUNFMT" (\nsubject VARCHAR(10) PRIMARY KEY,\n", temp, freqCSset->items[i].csId); // TODO underscores?
+ free(temp);
for (j = 0; j < labels[i].numProp; ++j) {
- char temp2[100];
+ char *temp2;
+ temp2 = (char *) malloc(sizeof(char) *
(strlen(labels[i].lstProp[j]) + 1));
+ if (!temp2) fprintf(stderr, "ERROR: Couldn't malloc
memory!\n");
strcpy(temp2, labels[i].lstProp[j]);
escapeURIforSQL(temp2);
@@ -376,6 +407,7 @@ void convertToSQL(CSset *freqCSset, Rela
// last column
fprintf(fout, "%s_%d BOOLEAN\n", temp2, j);
}
+ free(temp2);
}
fprintf(fout, ");\n\n");
}
@@ -384,17 +416,23 @@ void convertToSQL(CSset *freqCSset, Rela
for (i = 0; i < freqCSset->numCSadded; ++i) {
if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
for (j = 0; j < labels[i].numProp; ++j) {
- char temp2[100];
+ char *temp2;
int refCounter = 0;
+ temp2 = (char *) malloc(sizeof(char) *
(strlen(labels[i].lstProp[j]) + 1));
+ if (!temp2) fprintf(stderr, "ERROR: Couldn't malloc
memory!\n");
strcpy(temp2, labels[i].lstProp[j]);
escapeURIforSQL(temp2);
for (k = 0; k < relationMetadataCount[i][j]; ++k) {
int from, to;
- char tempFrom[100], tempTo[100];
+ char *tempFrom, *tempTo;
if (relationMetadata[i][j][k].percent <
FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough
from = relationMetadata[i][j][k].from;
to = relationMetadata[i][j][k].to;
+ tempFrom = (char *) malloc(sizeof(char) *
(strlen(labels[from].name) + 1));
+ if (!tempFrom) fprintf(stderr, "ERROR: Couldn't
malloc memory!\n");
+ tempTo = (char *) malloc(sizeof(char) *
(strlen(labels[to].name) + 1));
+ if (!tempTo) fprintf(stderr, "ERROR: Couldn't
malloc memory!\n");
strcpy(tempFrom, labels[from].name);
escapeURIforSQL(tempFrom);
strcpy(tempTo, labels[to].name);
@@ -403,7 +441,10 @@ void convertToSQL(CSset *freqCSset, Rela
fprintf(fout, "ALTER TABLE %s_"BUNFMT" ADD
COLUMN %s_%d_%d VARCHAR(10);\n", tempFrom, freqCSset->items[from].csId, temp2,
j, refCounter);
fprintf(fout, "ALTER TABLE %s_"BUNFMT" ADD
FOREIGN KEY (%s_%d_%d) REFERENCES %s_"BUNFMT"(subject);\n\n", tempFrom,
freqCSset->items[from].csId, temp2, j, refCounter, tempTo,
freqCSset->items[to].csId);
refCounter += 1;
+ free(tempFrom);
+ free(tempTo);
}
+ free(temp2);
}
}
@@ -411,11 +452,87 @@ void convertToSQL(CSset *freqCSset, Rela
TKNZRclose(&ret);
}
+static
+void createSQLMetadata(CSset* freqCSset, CSmergeRel* csRelBetweenMergeFreqSet,
Labels* labels) {
+ int **matrix = NULL; // matrix[from][to] frequency
+ int i, j, k;
+ FILE *fout;
+
+ // init
+ matrix = (int **) malloc(sizeof(int *) * freqCSset->numCSadded);
+ if (!matrix) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ matrix[i] = (int *) malloc(sizeof(int) * freqCSset->numCSadded);
+ if (!matrix) fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+
+ for (j = 0; j < freqCSset->numCSadded; ++j) {
+ matrix[i][j] = 0;
+ }
+ }
+
+ // set values
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+
+ for (j = 0; j < freqCSset->items[i].numProp; ++j) { // propNo
in CS order
+ // check foreign key frequency
+ int sum = 0;
+ for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef;
++k) {
+ if (csRelBetweenMergeFreqSet[i].lstPropId[k] ==
freqCSset->items[i].lstProp[j]) {
+ sum +=
csRelBetweenMergeFreqSet[i].lstCnt[k];
+ }
+ }
+
+ for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef;
++k) { // propNo in CSrel
+ if (csRelBetweenMergeFreqSet[i].lstPropId[k] ==
freqCSset->items[i].lstProp[j]) {
+ int to =
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
+ if (i == to) continue; // ignore self
references
+ if ((int) (100.0 * csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) continue; // foreign key is not frequent enough
+ matrix[i][to] += csRelBetweenMergeFreqSet[i].lstCnt[k]; // multiple links from 'i' to 'to'? add the frequencies
+ }
+ }
+ }
+ }
+
+ // store matrix as csv
+ fout = fopen("adjacencyList.csv", "wt");
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ for (j = 0; j < freqCSset->numCSadded; ++j) {
+ if (matrix[i][j]) {
+ fprintf(fout, "\"%d\",\"%d\",\"%d\"\n", i, j,
matrix[i][j]);
+ }
+ }
+ }
+ fclose(fout);
+
+ // print id -> table name
+ fout = fopen("tableIdFreq.csv", "wt");
+ for (i = 0; i < freqCSset->numCSadded; ++i) {
+ char *temp;
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list