Changeset: 3e4ece2b7085 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=3e4ece2b7085
Modified Files:
monetdb5/extras/rdf/rdflabels.c
Branch: rdf
Log Message:
Improve label quality
- Computation of similarity between CSs and classes is now based on the
  assumption that all properties of a CS should belong to one ontology class,
  not that the CS has to consist of ALL properties of the corresponding
  ontology class.
- Type values are usually multi-valued properties; the values represent the
  hierarchy the subject belongs to (e.g., if a subject in the dbpedia dataset
  has type 'Athlete', it also has types 'Person', 'Agent', 'Thing'). This
  hierarchy is analyzed and only the most specific type value (the "leaf") is
  added to the data structures. This improves the label candidates that are
  computed using type values.
diffs (284 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -851,19 +851,89 @@ int compareTypeAttributesFreqs (const vo
#endif
#if USE_TYPE_NAMES
+/*
+ * Analyze the hierarchy in a list of type values and add all leaf values to
+ * the histogram. Values that are not present in the hierarchy tree built from
+ * the ontologies are NOT added to the histogram.
+ *
+ * typeList / typeListLength:    type values (oids) to analyze.
+ * typeAttributesHistogram:      buckets, indexed [csFreqIdx][type][bucket].
+ * typeAttributesHistogramCount: number of buckets, indexed [csFreqIdx][type].
+ * csFreqIdx, type:              select the bucket list that is updated.
+ * ontmetaBat:                   searched (via its mirror) for each type value;
+ *                               the position found is used as index into
+ *                               'ontclassSet'.
+ * ontclassSet:                  ontology classes; 'numsc' / 'scIdxes' list the
+ *                               superclasses of a class as indexes into this
+ *                               same array.
+ *
+ * For every leaf value the frequency of its bucket is incremented; if no
+ * bucket exists yet, one is appended with frequency 1. If memory cannot be
+ * allocated, an error is printed to stderr and the affected value(s) are
+ * skipped; the histogram and its count always stay consistent.
+ */
+static
+void insertLeafsIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat, OntClass *ontclassSet) {
+    int i, j, k;
+    int fit;
+    char *leaf; // flag whether a type value in 'typeList' is a leaf (1) or not (0)
+    BUN pos;
+    OntClass hierarchy;
+
+    if (typeListLength <= 0) return; // nothing to analyze
+
+    // start with: every type value is a leaf
+    leaf = GDKmalloc(sizeof(char) * typeListLength);
+    if (!leaf) {
+        // without the flag array no analysis is possible; leave the histogram untouched
+        fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+        return;
+    }
+    for (i = 0; i < typeListLength; ++i) leaf[i] = 1;
+
+    // analyze hierarchy
+    for (i = 0; i < typeListLength; ++i) {
+        // NOTE(review): superclasses of a value that was already marked as
+        // non-leaf are not visited. This is only complete if 'scIdxes' holds
+        // ALL (transitive) superclasses of a class -- TODO confirm.
+        if (!leaf[i]) continue;
+        pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]);
+        if (pos == BUN_NONE) {
+            // no ontology information for this type value, therefore it is not added to the histogram
+            leaf[i] = 0;
+            continue;
+        }
+
+        // get hierarchy of this type value
+        hierarchy = ontclassSet[pos];
+
+        // every other list entry that is a superclass of value 'i' is not a leaf
+        for (j = 0; j < hierarchy.numsc; ++j) {
+            for (k = 0; k < typeListLength; ++k) {
+                if (i == k) continue;
+                if (ontclassSet[hierarchy.scIdxes[j]].cOid == typeList[k]) {
+                    // found superclass at position 'k'
+                    leaf[k] = 0;
+                }
+            }
+        }
+    }
+
+    // add all leafs to the histogram
+    for (i = 0; i < typeListLength; ++i) {
+        if (!leaf[i]) continue;
+        fit = 0;
+        for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; ++j) {
+            if (typeAttributesHistogram[csFreqIdx][type][j].value == typeList[i]) {
+                // bucket exists
+                typeAttributesHistogram[csFreqIdx][type][j].freq += 1;
+                fit = 1;
+                break;
+            }
+        }
+        if (!fit) {
+            // bucket does not exist: grow the bucket list by one entry
+            int newCount = typeAttributesHistogramCount[csFreqIdx][type] + 1;
+            TypeAttributesFreq *grown = (TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][type], sizeof(TypeAttributesFreq) * newCount);
+            if (!grown) {
+                // realloc leaves the old block valid on failure: keep the old
+                // pointer and count, and skip this value instead of writing
+                // through a NULL pointer
+                fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+                continue;
+            }
+            typeAttributesHistogram[csFreqIdx][type] = grown;
+            typeAttributesHistogramCount[csFreqIdx][type] = newCount;
+
+            // insert value
+            grown[newCount - 1].value = typeList[i];
+            grown[newCount - 1].freq = 1;
+        }
+    }
+
+    GDKfree(leaf);
+}
+
+
/* Loop through all subjects to collect frequency statistics for type attribute values. */
static
-void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount, char** typeAttributes) {
+void createTypeAttributesHistogram(BAT *sbat, BATiter si, BATiter pi, BATiter
oi, oid *subjCSMap, CSset *freqCSset, int *csIdFreqIdxMap, int
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount, char** typeAttributes, BAT *ontmetaBat, OntClass
*ontclassSet) {
// looping, extracting
BUN p, q;
oid *sbt, *obt, *pbt;
char objType;
oid objOid;
int csFreqIdx;
+ oid curS; // last subject
+ int curT; // last type (index in 'typeAttributes' array)
+ oid *typeValues; // list of type values per subject and type
+ int typeValuesSize;
+ int typeValuesMaxSize = 10;
// histogram
int i, j, k;
- int fit;
oid *typeAttributesOids = malloc(sizeof(oid) *
typeAttributesCount);
@@ -878,6 +948,11 @@ void createTypeAttributesHistogram(BAT *
TKNZRappend(&typeAttributesOids[i], &typeAttributes[i]);
}
+ curS = BUN_NONE;
+ curT = -1;
+ typeValues = GDKmalloc(sizeof(oid) * typeValuesMaxSize);
+ if (!typeValues) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+ typeValuesSize = 0;
BATloop(sbat, p, q) {
// Get data
sbt = (oid *) BUNtloc(si, p);
@@ -907,32 +982,36 @@ void createTypeAttributesHistogram(BAT *
objOid = objOid - (objType*2 + 1) *
RDF_MIN_LITERAL; /* Get the real objOid from Map or Tokenizer */
}
- // add object to histogram
- fit = 0;
- for (j = 0; j <
typeAttributesHistogramCount[csFreqIdx][i]; ++j) {
- if
(typeAttributesHistogram[csFreqIdx][i][j].value == objOid) {
- // bucket exists
-
typeAttributesHistogram[csFreqIdx][i][j].freq += 1;
- fit = 1;
- break;
+ // if finished looping over one subject or
type, the list of type values is analyzed and added to the histogram
+ if (curS != *sbt || curT != i) {
+ if (curS == BUN_NONE || typeValuesSize
== 0) {
+ // nothing to add to histogram
+ } else {
+ // analyze values and add to
histogram
+
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize,
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT,
ontmetaBat, ontclassSet);
+ typeValuesSize = 0; // reset
}
+ curS = *sbt;
+ curT = i;
}
- if (!fit) {
- // bucket does not exist
- // realloc
-
typeAttributesHistogramCount[csFreqIdx][i] += 1;
- typeAttributesHistogram[csFreqIdx][i] =
(TypeAttributesFreq *) realloc(typeAttributesHistogram[csFreqIdx][i],
sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[csFreqIdx][i]);
- if
(!typeAttributesHistogram[csFreqIdx][i]) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
-
- // insert value
-
typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i]
- 1].value = objOid;
-
typeAttributesHistogram[csFreqIdx][i][typeAttributesHistogramCount[csFreqIdx][i]
- 1].freq = 1;
+ // add value to list of type values
+ if (typeValuesSize == typeValuesMaxSize) {
+ // resize
+ typeValuesMaxSize *= 2;
+ typeValues = GDKrealloc(typeValues,
sizeof(oid) * typeValuesMaxSize);
+ if (!typeValues) fprintf(stderr,
"ERROR: Couldn't realloc memory!\n");
}
+ typeValues[typeValuesSize++] = *obt;
break;
}
}
}
+ // analyze and add last set of typeValues
+ if (curS != BUN_NONE && typeValuesSize != 0)
insertLeafsIntoTypeAttributesHistogram(typeValues, typeValuesSize,
typeAttributesHistogram, typeAttributesHistogramCount, csFreqIdx, curT,
ontmetaBat, ontclassSet);
+
+ GDKfree(typeValues);
+
// sort descending by frequency
for (i = 0; i < freqCSset->numCSadded; ++i) {
for (j = 0; j < typeAttributesCount; ++j) {
@@ -1094,7 +1173,7 @@ int compareOntologyCandidates (const voi
#if USE_ONTOLOGY_NAMES
/* For one CS: Calculate the ontology classes that are similar (tfidf) to the
list of attributes. */
static
-oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid**
ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int
*listCount, int listNum, PropStat *propStat, float *totaltfidfsPerOntology) {
+oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid**
ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int
*listCount, int listNum, PropStat *propStat) {
int i, j, k, l;
oid *result = NULL;
@@ -1147,6 +1226,7 @@ oid* getOntologyCandidates(oid** ontattr
BUN p, bun;
p = listOids[i][j];
bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &p);
+ if (bun == BUN_NONE) continue; // property does not
belong to an ontology class and therefore has no tfidfs score
for (k = 0; k < candidatesCount[j]; ++k) { // for each
candidate
// search for this class
int found = 0;
@@ -1169,21 +1249,6 @@ oid* getOntologyCandidates(oid** ontattr
}
}
}
-
- //[DUC --- add the total tfidf score for a ontology class]
//TODO: Compute before, not here
- for (l = 0; l < num; ++l){
- for (j = 0; j < ontmetadataCount; ++j) {
- oid auri = ontmetadata[0][j];
- //printf("auri = " BUNFMT "\n", auri);
- if (auri == classStat[l].ontoClass){
- //printf("Classstat %d (uri: "BUNFMT ")
- Set totaltfidf with ontology %dth: %f \n", l, auri, j,
totaltfidfsPerOntology[j]);
- classStat[l].totaltfidfs =
totaltfidfsPerOntology[j];
- break;
- }
- }
- }
- //[ ... DUC]
-
// calculate optimal tfidf score (all properties) & normalize
tfidf sums
totalTfidfs = 0.0;
@@ -1194,11 +1259,7 @@ oid* getOntologyCandidates(oid** ontattr
totalTfidfs += (propStat->tfidfs[bun] *
propStat->tfidfs[bun]);
}
for (j = 0; j < num; ++j) {
- //classStat[j].tfidfs /= totalTfidfs; //[DUC--modify]
- //printf("original classStat[j].tfidfs = %f \n",
classStat[j].tfidfs);
- classStat[j].tfidfs = classStat[j].tfidfs /
(sqrt(totalTfidfs)*sqrt(classStat[j].totaltfidfs));
- //printf("totalTfidfs = %f ||
classStat[j].totaltfidfs = %f || classStat[j].tfidfs = %f
\n",totalTfidfs,classStat[j].totaltfidfs,classStat[j].tfidfs);
-
+ classStat[j].tfidfs /= totalTfidfs;
}
// sort by tfidf desc
@@ -1408,8 +1469,6 @@ static
void createOntologyLookupResult(oid** result, CSset* freqCSset, int*
resultCount, oid** ontattributes, int ontattributesCount, oid** ontmetadata,
int ontmetadataCount) {
int i, j;
PropStat *propStat;
- float* totaltfidfsPerOntology; //[DUC]
- oid lastUri;
propStat = initPropStat();
@@ -1417,34 +1476,6 @@ void createOntologyLookupResult(oid** re
// Not the properties from freqCS
//createPropStatistics(propStat, freqCSset->numCSadded, freqCSset);
createPropStatistics(propStat, ontattributes, ontattributesCount);
-
-
- lastUri = BUN_NONE;
- totaltfidfsPerOntology = (float*) malloc(sizeof(float) *
ontmetadataCount);
- //printf("Init tfidf for all %d ontologies \n",ontmetadataCount );
- for (i = 0; i < ontmetadataCount; ++i) {
- oid auri = ontmetadata[0][i];
-
- if (auri == lastUri){
- //printf("Duplication at %d value " BUNFMT "\n", i,
auri);
- continue;
- }
- else lastUri = auri;
- totaltfidfsPerOntology[i] = 0;
-
- for (j = 0; j < ontattributesCount; j++){
- oid tmpuri = ontattributes[0][j];
- oid aattr = ontattributes[1][j];
- if (auri == tmpuri){
- BUN bun = BUNfnd(BATmirror(propStat->pBat),
(ptr) &aattr);
- if (bun == BUN_NONE) printf("[Debug] This
cannot happen \n");
- else
- totaltfidfsPerOntology[i] +=
(propStat->tfidfs[bun] * propStat->tfidfs[bun]);
- }
- }
- //printf("Computed totaltfidfsPerOntology of ontology %d: %f
(uri = "BUNFMT")\n",i, totaltfidfsPerOntology[i],auri);
- }
- //... [DUC]
for (i = 0; i < freqCSset->numCSadded; ++i) {
CS cs;
@@ -1475,7 +1506,7 @@ void createOntologyLookupResult(oid** re
// get class names
resultCount[i] = 0;
- result[i] = getOntologyCandidates(ontattributes,
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]),
propOntologiesOids, propOntologiesCount, ontologyCount,
propStat,totaltfidfsPerOntology);
+ result[i] = getOntologyCandidates(ontattributes,
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]),
propOntologiesOids, propOntologiesCount, ontologyCount, propStat);
for (j = 0; j < ontologyCount; ++j) {
free(propOntologies[j]);
@@ -1486,7 +1517,6 @@ void createOntologyLookupResult(oid** re
free(propOntologiesCount);
}
freePropStat(propStat);
- free(totaltfidfsPerOntology);
}
#endif
@@ -2703,7 +2733,7 @@ CSlabel* createLabels(CSset* freqCSset,
typeAttributesHistogramCount =
initTypeAttributesHistogramCount(typeAttributesCount, freqCSset->numCSadded);
typeAttributesHistogram =
initTypeAttributesHistogram(typeAttributesCount, freqCSset->numCSadded);
#if USE_TYPE_NAMES
- createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset,
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram,
typeAttributesHistogramCount, typeAttributes);
+ createTypeAttributesHistogram(sbat, si, pi, oi, subjCSMap, freqCSset,
csIdFreqIdxMap, typeAttributesCount, typeAttributesHistogram,
typeAttributesHistogramCount, typeAttributes, ontmetaBat, ontclassSet);
typeStat = getTypeStats(&typeStatCount, freqCSset->numCSadded,
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
#else
(void) sbat;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list