Changeset: 4f9d12a701c4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4f9d12a701c4
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:
Add ontology tree
Stores distribution of data, used for CS merging
diffs (truncated from 331 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1656,7 +1656,6 @@ void printUML2(CSset *freqCSset, CSlabel
TKNZRclose(&ret);
}
-#if USE_TABLE_NAME
static
str* getOntoHierarchy(str ontology, int* hierarchyCount, str** ontmetadata,
int ontmetadataCount) {
int i;
@@ -1677,7 +1676,7 @@ str* getOntoHierarchy(str ontology, int*
// lookup superclass
int foundTuple = 0;
for (i = 0; i < ontmetadataCount; ++i) {
- str muristr = ontmetadata[0][i];
+ str muristr = ontmetadata[0][i];
str msuperstr = ontmetadata[1][i];
if (strcmp(hierarchy[(*hierarchyCount) - 1], muristr)
== 0) {
// found entry
@@ -1707,8 +1706,6 @@ str* getOntoHierarchy(str ontology, int*
return hierarchy;
}
-#endif
-
#if USE_TABLE_NAME
/* For one CS: Choose the best table name out of all collected candidates
(ontology, type, fk). */
@@ -1972,6 +1969,182 @@ void createLinks(CSset* freqCSset, Relat
#endif
+/*
+ * Recursively fills in the aggregated statistics of an ontology usage tree.
+ *
+ * For every node of the subtree, numOccurancesSum becomes the node's own
+ * numOccurances plus the numOccurancesSum of all its children, and
+ * percentage becomes numOccurancesSum / numTuples.
+ *
+ * tree       root of the subtree to process (must not be NULL)
+ * numTuples  denominator for the percentage; if it is not positive the
+ *            percentage is set to 0.0 instead of dividing by zero
+ */
static
+void createOntoUsageTreeStatistics(OntoUsageNode* tree, int numTuples) {
+	int i;
+
+	// start with the node's own count; a leaf has no children, so the loop
+	// below does not run and the sum stays at this value
+	tree->numOccurancesSum = tree->numOccurances;
+	for (i = 0; i < tree->numChildren; ++i) {
+		// children first, so that their sums are final before they are added
+		createOntoUsageTreeStatistics(tree->lstChildren[i], numTuples);
+		tree->numOccurancesSum += tree->lstChildren[i]->numOccurancesSum;
+	}
+
+	if (numTuples > 0) {
+		tree->percentage = (1.0 * tree->numOccurancesSum) / numTuples;
+	} else {
+		// nothing was counted: avoid 0/0, which would store NaN
+		tree->percentage = 0.0;
+	}
+}
+
+/*
+ * Creates a new, empty node labelled with a private copy of `uri` and appends
+ * it to parent->lstChildren.
+ *
+ * Returns the new node, or NULL (after printing an error to stderr) if an
+ * allocation failed; in that case `parent` is left unchanged.
+ */
+static
+OntoUsageNode* appendOntoUsageChild(OntoUsageNode* parent, str uri) {
+	OntoUsageNode *child;
+	OntoUsageNode **children;
+
+	child = (OntoUsageNode *) malloc(sizeof(OntoUsageNode));
+	if (!child) {
+		fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+		return NULL;
+	}
+	child->uri = (str) malloc(sizeof(char) * (strlen(uri) + 1));
+	if (!child->uri) {
+		fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+		free(child);
+		return NULL;
+	}
+	strcpy(child->uri, uri);
+	child->parent = parent;
+	child->lstChildren = NULL;
+	child->numChildren = 0;
+	child->numOccurances = 0;
+	child->numOccurancesSum = 0;
+	child->percentage = 0.0;
+
+	// grow through a temporary so that the existing list survives a failed
+	// realloc, and only bump numChildren once the slot really exists
+	children = (OntoUsageNode **) realloc(parent->lstChildren,
+		sizeof(OntoUsageNode *) * (parent->numChildren + 1));
+	if (!children) {
+		fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+		free(child->uri);
+		free(child);
+		return NULL;
+	}
+	parent->lstChildren = children;
+	parent->lstChildren[parent->numChildren] = child;
+	parent->numChildren++;
+	return child;
+}
+
+/*
+ * Registers one occurrence of an ontology class in the usage tree.
+ *
+ * The path is read from hierarchy[hierarchyCount - 1] down to hierarchy[0]:
+ * the last array entry is looked up among the children of `tree`, the entry
+ * before it among that child's children, and so on. Nodes missing on the way
+ * are created. The node reached for hierarchy[0] (or `tree` itself when
+ * hierarchyCount is 0) gets its numOccurances incremented by one.
+ *
+ * tree            node the path starts below (must not be NULL)
+ * hierarchy       array of class URIs; the strings are copied, not kept
+ * hierarchyCount  number of entries in `hierarchy`
+ * numTuples       currently unused, see the TODO below
+ *
+ * If memory runs out, an error is printed and the occurrence is not counted;
+ * nodes created before the failure stay in the tree with a count of zero.
+ */
+static
+void addToOntoUsageTree(OntoUsageNode* tree, str* hierarchy, int hierarchyCount, int numTuples) {
+	OntoUsageNode *node = tree;
+	int level;
+
+	// TODO: add numTuples instead of 1 once cs.support is available
+	(void) numTuples;
+
+	for (level = hierarchyCount - 1; level >= 0; --level) {
+		OntoUsageNode *child = NULL;
+		int i;
+
+		// search through children
+		for (i = 0; i < node->numChildren; ++i) {
+			if (strcmp(node->lstChildren[i]->uri, hierarchy[level]) == 0) {
+				child = node->lstChildren[i];
+				break;
+			}
+		}
+		if (child == NULL) {
+			// child not found --> create it
+			child = appendOntoUsageChild(node, hierarchy[level]);
+			if (child == NULL)
+				return;
+		}
+		node = child;
+	}
+
+	// found position in tree
+	node->numOccurances += 1;
+}
+
+
+/*
+ * Prints the subtree rooted at `tree` to stdout, one line per node, each
+ * node before its children.
+ *
+ * tree   node to print (must not be NULL)
+ * level  depth value printed for `tree`; children are printed with level + 1
+ *
+ * The percentage is stored in the range [0..1] and printed multiplied by 100.
+ */
+static
+void printTree(OntoUsageNode* tree, int level) {
+	int i;
+	// the artificial root has uri == NULL, and passing NULL to %s is
+	// undefined behaviour, so substitute a printable placeholder
+	const char *uri = tree->uri ? tree->uri : "(null)";
+
+	printf("Level %d URI %s Count %d Sum %d Percent %.1f\n", level, uri,
+		tree->numOccurances, tree->numOccurancesSum, tree->percentage * 100);
+	for (i = 0; i < tree->numChildren; ++i) {
+		printTree(tree->lstChildren[i], level + 1);
+	}
+}
+
+/*
+ * Picks the ontology class that represents CS `csIdx`.
+ *
+ * - exactly one candidate class: that class
+ * - otherwise: the first candidate that also occurs as a type value whose
+ *   percent is at least TYPE_FREQ_THRESHOLD, walking the type attributes and
+ *   their histogram entries in order (each histogram is scanned until the
+ *   first entry below the threshold, i.e. it is expected to be sorted)
+ * - no such candidate: the first candidate class
+ *
+ * Returns a pointer into result[csIdx] (no copy is made). The caller must
+ * make sure that result[csIdx] has at least one entry.
+ */
+static
+str chooseOntologyClass(int csIdx, str** result, int* resultCount, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount) {
+	int t, j, k;
+
+	if (resultCount[csIdx] == 1) {
+		// one ontology class --> use it
+		return result[csIdx][0];
+	}
+
+	// multiple ontology classes --> intersect with types
+	for (t = 0; t < typeAttributesCount; ++t) {
+		for (j = 0; j < typeAttributesHistogramCount[csIdx][t]; ++j) {
+			if (typeAttributesHistogram[csIdx][t][j].percent < TYPE_FREQ_THRESHOLD) break; // sorted
+			for (k = 0; k < resultCount[csIdx]; ++k) {
+				if (strcmp(result[csIdx][k], typeAttributesHistogram[csIdx][t][j].value) == 0) {
+					// only the first hit is ever used, so stop here
+					return result[csIdx][k];
+				}
+			}
+		}
+	}
+
+	// empty intersection --> fall back to the first candidate
+	return result[csIdx][0];
+}
+
+/*
+ * Builds the ontology usage tree for all frequent CSs and prints it.
+ *
+ * An artificial root (uri == NULL) is allocated and stored in *tree. For
+ * every CS with at least one ontology candidate, one class is chosen, its
+ * hierarchy is obtained from getOntoHierarchy(), and the class is counted in
+ * the tree. Afterwards the per-node sums and percentages are computed and the
+ * tree is printed to stdout.
+ *
+ * tree                          out: receives the root of the new tree
+ * freqCSset                     the CSs; numCSadded entries are processed
+ * ontmetadata, ontmetadataCount passed through to getOntoHierarchy()
+ * result, resultCount           per CS: candidate ontology classes and their
+ *                               number
+ * typeAttributesCount,
+ * typeAttributesHistogram,
+ * typeAttributesHistogramCount  per CS and type attribute: type value
+ *                               histogram and its length
+ *
+ * If the root cannot be allocated, an error is printed, *tree is NULL and
+ * nothing else is done. A CS whose URI copy cannot be allocated is skipped.
+ */
+static
+void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, str** ontmetadata, int ontmetadataCount, str** result, int* resultCount, int typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount) {
+	int i;
+	int numTuples = 0;
+
+	// init tree with an artificial root node
+	(*tree) = (OntoUsageNode *) malloc(sizeof(OntoUsageNode));
+	if (!(*tree)) {
+		fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+		return;
+	}
+	(*tree)->parent = NULL;
+	(*tree)->uri = NULL; // artificial root
+	(*tree)->lstChildren = NULL;
+	(*tree)->numChildren = 0;
+	(*tree)->numOccurances = 0;
+	(*tree)->numOccurancesSum = 0;
+	(*tree)->percentage = 0.0;
+
+	// loop through data; `i` is the CS index and must not be reused by any
+	// inner loop
+	for (i = 0; i < freqCSset->numCSadded; ++i) {
+		str chosen;
+		str uri;
+		int hierarchyCount = 0;
+		str* hierarchy;
+
+		if (resultCount[i] == 0) {
+			// no hierarchy --> ignore
+			continue;
+		}
+
+		// get ontology class and hand a private copy to the hierarchy lookup
+		chosen = chooseOntologyClass(i, result, resultCount, typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
+		uri = (char *) malloc(sizeof(char) * (strlen(chosen) + 1));
+		if (!uri) {
+			fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+			continue;
+		}
+		strcpy(uri, chosen);
+
+		// get ontology hierarchy
+		hierarchy = getOntoHierarchy(uri, &hierarchyCount, ontmetadata, ontmetadataCount);
+		// NOTE(review): `uri` and `hierarchy` are not released here because
+		// the ownership rules of getOntoHierarchy() are not visible in this
+		// changeset -- confirm, and free them if the caller owns them
+
+		// search class in tree and add CS to statistics
+		addToOntoUsageTree(*tree, hierarchy, hierarchyCount, freqCSset->items[i].support);
+		// TODO: add freqCSset->items[i].support (total number of tuples in
+		// the dataset) instead of 1 once cs.support is available
+		numTuples += 1;
+	}
+
+	// calculate summed parameters
+	createOntoUsageTreeStatistics(*tree, numTuples);
+
+	// print
+	printf("Ontology tree:\n");
+	printTree(*tree, 0);
+}
+
+static
void freeTypeAttributesHistogram(TypeAttributesFreq***
typeAttributesHistogram, int csCount, int typeAttributesCount) {
int i, j;
@@ -2111,7 +2284,7 @@ int* getSubCS(CSset* freqCSset, int csId
}
/* Creates labels for all CS (without a parent). */
-CSlabel* createLabels(CSset* freqCSset, CSrel* csrelSet, int num, BAT *sbat,
BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, BAT* mbat, int
*csIdFreqIdxMap, str** ontattributes, int ontattributesCount, str**
ontmetadata, int ontmetadataCount) {
+CSlabel* createLabels(CSset* freqCSset, CSrel* csrelSet, int num, BAT *sbat,
BATiter si, BATiter pi, BATiter oi, oid *subjCSMap, BAT* mbat, int
*csIdFreqIdxMap, str** ontattributes, int ontattributesCount, str**
ontmetadata, int ontmetadataCount, OntoUsageNode** ontoUsageTree) {
#if USE_TYPE_NAMES
char* typeAttributes[] = {
"http://ogp.me/ns#type",
@@ -2178,9 +2351,7 @@ CSlabel* createLabels(CSset* freqCSset,
// freeOntmetadata(ontmetadata);
#else
(void) ontattributesCount;
- (void) ontmetadataCount;
(void) ontattributes;
- (void) ontmetadata;
#endif
// Assigning Names
@@ -2190,6 +2361,9 @@ CSlabel* createLabels(CSset* freqCSset,
if (typeStatCount > 0) free(typeStat);
#endif
+ // Collect ontology statistics (tree)
+ createOntoUsageTree(ontoUsageTree, freqCSset, ontmetadata,
ontmetadataCount, ontologyLookupResult, ontologyLookupResultCount,
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
+
free(ontologyLookupResultCount);
freeOntologyLookupResult(ontologyLookupResult, freqCSset->numCSadded);
freeTypeAttributesHistogram(typeAttributesHistogram,
freqCSset->numCSadded, typeAttributesCount);
@@ -2311,3 +2485,22 @@ void freeFinalLabels(CSlabel* labels, CS
}
free(labels);
}
+
+/*
+ * Releases the whole subtree rooted at `tree`: all descendants first, then
+ * the node's child list, its URI string and the node itself.
+ *
+ * Passing NULL is allowed and does nothing.
+ */
+void freeOntoUsageTree(OntoUsageNode* tree) {
+	int i;
+
+	if (tree == NULL)
+		return;
+
+	for (i = 0; i < tree->numChildren; ++i) {
+		freeOntoUsageTree(tree->lstChildren[i]);
+	}
+	// free(NULL) is a no-op, so a leaf (lstChildren == NULL) and the
+	// artificial root (uri == NULL) need no special casing
+	free(tree->lstChildren);
+	free(tree->uri);
+	free(tree);
+}
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -68,6 +68,17 @@ typedef struct TypeStat {
int freq; // number of CS's the value occurs in
} TypeStat;
+// Tree node to store the number of tuples per ontology class.
+// Nodes are created by createOntoUsageTree()/addToOntoUsageTree() and
+// released with freeOntoUsageTree().
+typedef struct OntoUsageNode {
+ struct OntoUsageNode *parent; // node this one hangs below; NULL for the root
+ struct OntoUsageNode **lstChildren; // array of numChildren child pointers; NULL while there are no children
+ str uri; // TODO uri==NULL <=> artificial root
+ // number of entries in lstChildren
+ int numChildren;
+ // count for this class itself, incremented by one per CS assigned to it
+ int numOccurances; // TODO overflow 2,000,000?
+ // numOccurances of this node plus numOccurancesSum of all children;
+ // filled in by createOntoUsageTreeStatistics()
+ int numOccurancesSum;
+ // numOccurancesSum divided by the total passed to
+ // createOntoUsageTreeStatistics()
+ float percentage; // TODO rename, range [0..1]
+} OntoUsageNode;
+
#define FK_FREQ_THRESHOLD 25 // X % of the targeted subjects have to
be in this table
#define TYPE_FREQ_THRESHOLD 10 // X % of the type values have to be
this value
#define ONTOLOGY_FREQ_THRESHOLD 0.5 // similarity threshold for tfidf
simularity for ontology classes
@@ -80,7 +91,7 @@ typedef struct TypeStat {
#define SHOW_CANDIDATES 0 // inserts a row in UML diagrams to
show all candidate names
rdf_export CSlabel*
-createLabels(CSset* freqCSset, CSrel* csrelSet, int num, BAT *sbat, BATiter
si, BATiter pi, BATiter oi, oid *subjCSMap, BAT* mapbat, int *csIdFreqIdxMap,
str** ontattributes, int ontattributesCount, str** ontmetadata, int
ontmetadataCount);
+createLabels(CSset* freqCSset, CSrel* csrelSet, int num, BAT *sbat, BATiter
si, BATiter pi, BATiter oi, oid *subjCSMap, BAT* mapbat, int *csIdFreqIdxMap,
str** ontattributes, int ontattributesCount, str** ontmetadata, int
ontmetadataCount, OntoUsageNode** ontoUsageTree);
rdf_export CSlabel*
createFinalLabels(CSlabel* labels, CSset* freqCSset, CSmergeRel*
csRelBetweenMergeFreqSet, int freqThreshold);
@@ -91,4 +102,7 @@ freeLabels(CSlabel* labels, CSset* freqC
rdf_export void
freeFinalLabels(CSlabel* labels, CSset* freqCSset);
+rdf_export void
+freeOntoUsageTree(OntoUsageNode* tree);
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list