Changeset: 9aa7d8033c08 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9aa7d8033c08
Modified Files:
monetdb5/extras/rdf/rdf.h
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:
Add list of candidates for each CSlabel
Besides the name, a list of label candidates is stored for each CSlabel. The
candidates are used by the CS merging algorithm.
diffs (truncated from 450 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -106,6 +106,8 @@ typedef enum {
// Final data structure that stores the labels for tables and attributes
typedef struct CSlabel {
str name; // table name
+ str *candidates; // list of table name candidates,
candidates[0] == name
+ int candidatesCount;// number of entries in the candidates
list
str *hierarchy; // hierarchy "bottom to top"
int hierarchyCount; // number of entries in the hierarchy
list
int numProp; // number of properties, copied from
freqCSset->items[x].numProp
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1712,8 +1712,20 @@ void getTableName(CSlabel* label, int cs
int i, j, k;
str *tmpList;
int tmpListCount;
+ char nameFound = 0;
// --- ONTOLOGY ---
+ // add all ontology candidates to list of candidates
+ if (resultCount[csIdx] >= 1) {
+ label->candidates = realloc(label->candidates, sizeof(str) *
(label->candidatesCount + resultCount[csIdx]));
+ if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
+ for (i = 0; i < resultCount[csIdx]; ++i) {
+ label->candidates[label->candidatesCount + i] = (char
*) malloc(sizeof(char) * (strlen(result[csIdx][i]) + 1));
+ strcpy(label->candidates[label->candidatesCount + i],
result[csIdx][i]);
+ }
+ label->candidatesCount += resultCount[csIdx];
+ }
+
// one ontology class --> use it
if (resultCount[csIdx] == 1) {
#if USE_SHORT_NAMES
@@ -1721,65 +1733,74 @@ void getTableName(CSlabel* label, int cs
#else
label->name = (char *) malloc(sizeof(char) *
(strlen(result[csIdx][0]) + 1));
strcpy(label->name, result[csIdx][0]);
+#endif
label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
-#endif
- return;
+ nameFound = 1;
}
- // multiple ontology classes --> intersect with types
- if (resultCount[csIdx] > 1) {
- tmpList = NULL;
- tmpListCount = 0;
- // search for type values
- for (i = 0; i < typeAttributesCount; ++i) {
- for (j = 0; j < typeAttributesHistogramCount[csIdx][i];
++j) {
- if
(typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; //
sorted
- // intersect type with ontology classes
- for (k = 0; k < resultCount[csIdx]; ++k) {
- if (strcmp(result[csIdx][k],
typeAttributesHistogram[csIdx][i][j].value) == 0) {
- // found, copy ontology class
to tmpList
- tmpList = (str *)
realloc(tmpList, sizeof(str) * (tmpListCount + 1));
- if (!tmpList) fprintf(stderr,
"ERROR: Couldn't realloc memory!\n");
- tmpList[tmpListCount] =
result[csIdx][k]; // pointer, no copy
- tmpListCount += 1;
+ if (!nameFound) {
+ // multiple ontology classes --> intersect with types
+ if (resultCount[csIdx] > 1) {
+ tmpList = NULL;
+ tmpListCount = 0;
+ // search for type values
+ for (i = 0; i < typeAttributesCount; ++i) {
+ for (j = 0; j <
typeAttributesHistogramCount[csIdx][i]; ++j) {
+ if
(typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; //
sorted
+ // intersect type with ontology classes
+ for (k = 0; k < resultCount[csIdx];
++k) {
+ if (strcmp(result[csIdx][k],
typeAttributesHistogram[csIdx][i][j].value) == 0) {
+ // found, copy ontology
class to tmpList
+ tmpList = (str *)
realloc(tmpList, sizeof(str) * (tmpListCount + 1));
+ if (!tmpList)
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+ tmpList[tmpListCount] =
result[csIdx][k]; // pointer, no copy
+ tmpListCount += 1;
+ }
}
}
}
+
+ // only one left --> use it
+ if (tmpListCount == 1) {
+#if USE_SHORT_NAMES
+ getPropNameShort(&(label->name), tmpList[0]);
+#else
+ label->name = (char *) malloc(sizeof(char) *
(strlen(tmpList[0]) + 1));
+ strcpy(label->name, tmpList[0]);
+#endif
+ label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
+ free(tmpList);
+ nameFound = 1;
+ }
+
+ if (!nameFound) {
+ // multiple left --> use the class that covers
most attributes, most popular ontology, ...
+ if (tmpListCount > 1) {
+#if USE_SHORT_NAMES
+ getPropNameShort(&(label->name),
tmpList[0]); // sorted
+#else
+ label->name = (char *)
malloc(sizeof(char) * (strlen(tmpList[0]) + 1));
+ strcpy(label->name, tmpList[0]); //
sorted
+#endif
+ label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
+ free(tmpList);
+ nameFound = 1;
+ }
+ }
+
+ if (!nameFound) {
+ // empty intersection -> use the class that
covers most attributes, most popular ontology, ..
+#if USE_SHORT_NAMES
+ getPropNameShort(&(label->name),
result[csIdx][0]); // sorted
+#else
+ label->name = (char *) malloc(sizeof(char) *
(strlen(result[csIdx][0]) + 1));
+ strcpy(label->name, result[csIdx][0]); // sorted
+#endif
+ label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
+ free(tmpList);
+ nameFound = 1;
+ }
}
- // only one left --> use it
- if (tmpListCount == 1) {
-#if USE_SHORT_NAMES
- getPropNameShort(&(label->name), tmpList[0]);
-#else
- label->name = (char *) malloc(sizeof(char) *
(strlen(tmpList[0]) + 1));
- strcpy(label->name, tmpList[0]);
-#endif
- label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
- free(tmpList);
- return;
- }
- // multiple left --> use the class that covers most attributes,
most popular ontology, ...
- if (tmpListCount > 1) {
-#if USE_SHORT_NAMES
- getPropNameShort(&(label->name), tmpList[0]); // sorted
-#else
- label->name = (char *) malloc(sizeof(char) *
(strlen(tmpList[0]) + 1));
- strcpy(label->name, tmpList[0]); // sorted
-#endif
- label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
- free(tmpList);
- return;
- }
- // empty intersection -> use the class that covers most
attributes, most popular ontology, ..
-#if USE_SHORT_NAMES
- getPropNameShort(&(label->name), result[csIdx][0]); // sorted
-#else
- label->name = (char *) malloc(sizeof(char) *
(strlen(result[csIdx][0]) + 1));
- strcpy(label->name, result[csIdx][0]); // sorted
-#endif
- label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
- free(tmpList);
- return;
}
// --- TYPE ---
@@ -1794,54 +1815,107 @@ void getTableName(CSlabel* label, int cs
tmpList[tmpListCount] =
typeAttributesHistogram[csIdx][i][0].value; // pointer, no copy
tmpListCount += 1;
}
- // one type attribute --> use most frequent one
- if (tmpListCount == 1) {
- // only one type attribute, use most frequent value (sorted)
-#if USE_SHORT_NAMES
- getPropNameShort(&(label->name), tmpList[0]);
-#else
- label->name = (char *) malloc(sizeof(char) *
(strlen(tmpList[0]) + 1));
- strcpy(label->name, tmpList[0]);
-#endif
- return;
- }
- // multiple type attributes --> use the one with fewest occurances in
other CS's
- if (tmpListCount > 1) {
+
+ // add all most frequent type values to list of candidates
+ if (tmpListCount >= 1) {
+ int counter = 0;
+ label->candidates = realloc(label->candidates, sizeof(str) *
(label->candidatesCount + tmpListCount));
+ if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
for (i = 0; i < typeStatCount; ++i) {
for (j = 0; j < tmpListCount; ++j) {
if (strcmp(typeStat[i].value, tmpList[j]) == 0)
{
+
label->candidates[label->candidatesCount + counter] = (char *)
malloc(sizeof(char) * (strlen(tmpList[j]) + 1));
+
strcpy(label->candidates[label->candidatesCount + counter], tmpList[j]);
+ counter++;
+ }
+ }
+ }
+ assert(counter == tmpListCount);
+ label->candidatesCount += tmpListCount;
+ }
+
+ if (!nameFound) {
+ // one type attribute --> use most frequent one
+ if (tmpListCount == 1) {
+ // only one type attribute, use most frequent value
(sorted)
#if USE_SHORT_NAMES
- getPropNameShort(&(label->name),
tmpList[j]);
+ getPropNameShort(&(label->name), tmpList[0]);
#else
- label->name = (char *)
malloc(sizeof(char) * (strlen(tmpList[j]) + 1));
- strcpy(label->name, tmpList[j]);
+ label->name = (char *) malloc(sizeof(char) *
(strlen(tmpList[0]) + 1));
+ strcpy(label->name, tmpList[0]);
#endif
- return;
+ nameFound = 1;
+ }
+ }
+
+ if (!nameFound) {
+ // multiple type attributes --> use the one with fewest
occurances in other CS's
+ if (tmpListCount > 1) {
+ for (i = 0; i < typeStatCount && !nameFound; ++i) {
+ for (j = 0; j < tmpListCount && !nameFound;
++j) {
+ if (strcmp(typeStat[i].value,
tmpList[j]) == 0) {
+#if USE_SHORT_NAMES
+
getPropNameShort(&(label->name), tmpList[j]);
+#else
+ label->name = (char *)
malloc(sizeof(char) * (strlen(tmpList[j]) + 1));
+ strcpy(label->name, tmpList[j]);
+#endif
+ nameFound = 1;
+ }
}
}
}
}
// --- FK ---
- // incident foreign keys --> use the one with the most occurances (num
and freq)
+ // add top3 fk values to list of candidates
if (links[csIdx].num > 0) {
- str propStr, tmpStr;
- takeOid(links[csIdx].fks[0].prop, &tmpStr); // sorted
- propStr = removeBrackets(tmpStr);
+ label->candidates = realloc(label->candidates, sizeof(str) *
(label->candidatesCount + MIN(3, links[csIdx].num)));
+ if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
+ for (i = 0; i < MIN(3, links[csIdx].num); ++i) {
+ str propStr, tmpStr;
+ takeOid(links[csIdx].fks[0].prop, &tmpStr);
+ propStr = removeBrackets(tmpStr);
+
+ label->candidates[label->candidatesCount + i] = (char
*) malloc(sizeof(char) * (strlen(propStr) + 1));
+ strcpy(label->candidates[label->candidatesCount + i],
propStr);
+ }
+ label->candidatesCount += MIN(3, links[csIdx].num);
+ }
+
+ if (!nameFound) {
+ // incident foreign keys --> use the one with the most
occurances (num and freq)
+ if (links[csIdx].num > 0) {
+ str propStr, tmpStr;
+ takeOid(links[csIdx].fks[0].prop, &tmpStr); // sorted
+ propStr = removeBrackets(tmpStr);
#if USE_SHORT_NAMES
- getPropNameShort(&(label->name), propStr);
+ getPropNameShort(&(label->name), propStr);
#else
- label->name = (char *) malloc(sizeof(char) * (strlen(propStr) +
1));
- strcpy(label->name, propStr);
+ label->name = (char *) malloc(sizeof(char) *
(strlen(propStr) + 1));
+ strcpy(label->name, propStr);
#endif
- GDKfree(tmpStr);
- GDKfree(propStr);
- return;
+ GDKfree(tmpStr);
+ GDKfree(propStr);
+ nameFound = 1;
+ }
}
// --- NOTHING ---
- label->name = (char *) malloc(sizeof(char) * 6);
- strcpy(label->name, "DUMMY");
+ if (label->candidatesCount == 0) {
+ label->candidates = realloc(label->candidates, sizeof(str));
+ if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
+ label->candidates[0] = (char *) malloc(sizeof(char) * 6);
+ strcpy(label->candidates[0], "DUMMY");
+ label->candidatesCount = 1;
+ }
+
+ if (!nameFound) {
+ label->name = (char *) malloc(sizeof(char) * 6);
+ strcpy(label->name, "DUMMY");
+ nameFound = 1;
+ }
+
return;
}
#endif
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list