Changeset: 944815cdd7d6 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=944815cdd7d6
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
Branch: rdf
Log Message:
Merge with Linnea's changes in rdflabels
diffs (188 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -851,47 +851,17 @@ int compareTypeAttributesFreqs (const vo
#endif
#if USE_TYPE_NAMES
-/* Analyze hierarchy in a list of type values, add all leaf values to the
histogram. Values that are not present in the hierarchy tree built from the
ontologies are NOT added to the histogram. */
+/* Add type values to the histogram. Values that are not present in the
hierarchy tree built from the ontologies are NOT added to the histogram. */
static
-void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength,
TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat,
OntClass *ontclassSet) {
- int i, j, k;
+void insertValuesIntoTypeAttributesHistogram(oid* typeList, int
typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) {
+ int i, j;
int fit;
- char *leaf; // flag whether a type value in 'typeList' is a
leaf (1) or not (0)
- BUN pos;
- OntClass hierarchy;
-
- // start with: every type value is a leaf
- leaf = GDKmalloc(sizeof(char) * typeListLength);
- for (i = 0; i < typeListLength; ++i) leaf[i] = 1;
-
- // analyze hierarchy
+
for (i = 0; i < typeListLength; ++i) {
- if (!leaf[i]) continue;
- pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
- if (pos == BUN_NONE) {
- // no ontology information for this type value,
therefore it is not added to the hierarchy
- leaf[i] = 0;
- continue;
- }
-
- // get hierarchy of this type value
- hierarchy = ontclassSet[pos];
-
- // loop over superclasses, set leaf=0
- for (j = 0; j < hierarchy.numsc; ++j) {
- for (k = 0; k < typeListLength; ++k) {
- if (i == k) continue;
- if (ontclassSet[hierarchy.scIdxes[j]].cOid ==
typeList[k]) {
- // found superclass at position 'k'
- leaf[k] = 0;
- }
- }
- }
- }
-
- // add all leafs to the histogram
- for (i = 0; i < typeListLength; ++i) {
- if (!leaf[i]) continue;
+ BUN pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
+ if (pos == BUN_NONE) continue; // no ontology information,
ignore
+
+ // add to histogram
fit = 0;
for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type];
++j) {
if (typeAttributesHistogram[csFreqIdx][type][j].value
== typeList[i]) {
@@ -913,13 +883,11 @@ void insertLeafsIntoTypeAttributesHistog
typeAttributesHistogram[csFreqIdx][type][typeAttributesHistogramCount[csFreqIdx][type]
- 1].freq = 1;
}
}
-
- GDKfree(leaf);
}
/* Loop through all subjects to collect frequency statistics for type
attribute values. */
static
-void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass
*ontclassSet) {
+void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat) {
// looping, extracting
BUN p, q;
oid *sbt, *obt, *pbt;
@@ -967,9 +935,7 @@ void createTypeAttributesHistogram(BAT *
// check if property (*pbt) is a type
for (i = 0; i < typeAttributesCount; ++i) {
if (*pbt == typeAttributesOids[i]) {
-
// prop is a type!
- csFreqIdx = csIdFreqIdxMap[subjCSMap[*sbt]];
// get object
obt = (oid *) BUNtloc(oi, p);
@@ -988,7 +954,8 @@ void createTypeAttributesHistogram(BAT *
// nothing to add to histogram
} else {
// analyze values and add to
histogram
-
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize,
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT,
ontmetaBat, ontclassSet);
+ csFreqIdx =
csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx of last subject
+
insertValuesIntoTypeAttributesHistogram(typeValues, typeValuesSize,
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT,
ontmetaBat);
typeValuesSize = 0; // reset
}
curS = *sbt;
@@ -1008,7 +975,10 @@ void createTypeAttributesHistogram(BAT *
}
// analyze and add last set of typeValues
- if (curS != BUN_NONE && typeValuesSize != 0)
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize,
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT,
ontmetaBat, ontclassSet);
+ if (curS != BUN_NONE && typeValuesSize != 0) {
+ csFreqIdx = csIdFreqIdxMap[subjCSMap[curS]]; // get csFreqIdx
of last subject
+ insertValuesIntoTypeAttributesHistogram(typeValues,
typeValuesSize, typeAttributesHistogram, typeAttributesHistogramCount,
csFreqIdx, curT, ontmetaBat);
+ }
GDKfree(typeValues);
@@ -1022,14 +992,10 @@ void createTypeAttributesHistogram(BAT *
// assign percentage
for (i = 0; i < freqCSset->numCSadded; ++i) {
for (j = 0; j < typeAttributesCount; ++j) {
- int sum = 0;
- // get total count of values
- for (k = 0; k < typeAttributesHistogramCount[i][j];
++k) {
- sum += typeAttributesHistogram[i][j][k].freq;
- }
// assign percentage values for every value
for (k = 0; k < typeAttributesHistogramCount[i][j];
++k) {
- typeAttributesHistogram[i][j][k].percent =
(int) (100.0 * typeAttributesHistogram[i][j][k].freq / sum + 0.5);
+ typeAttributesHistogram[i][j][k].percent =
(int) (100.0 * typeAttributesHistogram[i][j][k].freq /
freqCSset->items[i].support + 0.5);
+
}
}
}
@@ -2109,10 +2075,11 @@ void getTableName(CSlabel* label, int cs
oid *tmpList;
int tmpListCount;
char nameFound = 0;
+ oid maxDepthOid;
+ int maxFreq;
(void) ontmetaBat;
- (void) ontclassSet;
// --- ONTOLOGY ---
@@ -2228,7 +2195,28 @@ void getTableName(CSlabel* label, int cs
if (typeAttributesHistogram[csIdx][i][0].percent <
TYPE_FREQ_THRESHOLD) continue; // sorted
tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount
+ 1));
if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc
memory!\n");
- tmpList[tmpListCount] =
typeAttributesHistogram[csIdx][i][0].value;
+
+ // of all values that are >= TYPE_FREQ_THRESHOLD, choose the
value with the highest hierarchy level ("deepest" value)
+ maxDepthOid = typeAttributesHistogram[csIdx][i][0].value;
+ maxFreq = typeAttributesHistogram[csIdx][i][0].freq;
+ for (j = 1; j < typeAttributesHistogramCount[csIdx][i]; ++j) {
+ int depth, maxDepth;
+ int freq;
+ if (typeAttributesHistogram[csIdx][i][j].percent <
TYPE_FREQ_THRESHOLD) break;
+ depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat),
&typeAttributesHistogram[csIdx][i][j].value)].hierDepth;
+ maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat),
&maxDepthOid)].hierDepth;;
+ freq = typeAttributesHistogram[csIdx][i][j].freq;
+ if (depth > maxDepth) {
+ // choose value with higher hierarchy level
+ maxDepthOid =
typeAttributesHistogram[csIdx][i][j].value;
+ maxFreq = freq;
+ } else if (depth == maxDepth && freq > maxFreq) {
+ // if both values are on the same level, choose
the value with higher frequency
+ maxDepthOid =
typeAttributesHistogram[csIdx][i][j].value;
+ maxFreq = freq;
+ }
+ }
+ tmpList[tmpListCount] = maxDepthOid;
tmpListCount += 1;
}
@@ -2736,7 +2724,7 @@ CSlabel* createLabels(CSset* freqCSset,
typeAttributesHistogramCount =
initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded);
typeAttributesHistogram =
initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded);
#if USE_TYPE_NAMES
- createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset,
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram,
typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet);
+ createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset,
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram,
typeAttributesHistogramCount, typeAttributes, ontmetaBat);
typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded,
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
#else
(void) sbat;
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -91,8 +91,7 @@ enum {
} RULE;
#define FK_FREQ_THRESHOLD 25 // X % of the targeted subjects have to
be in this table
-//#define TYPE_FREQ_THRESHOLD 30 // X % of the type values have
to be this value
-#define TYPE_FREQ_THRESHOLD 0 // X % of the type values have to be
this value
+#define TYPE_FREQ_THRESHOLD 80 // X % of the type values have to be
this value
#define ONTOLOGY_FREQ_THRESHOLD 0.4 // similarity threshold for tfidf
similarity for ontology classes
#define USE_SHORT_NAMES 1 // use getPropNameShort()
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list