Changeset: a5f2a79c980b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a5f2a79c980b
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Add hierarchy information for an ontology-based name, regardless of where the
name comes from.
The name can be assigned based on the type value, ontology-class similarity, or
an FK. However, whenever the name can be found among the ontology classes, we
build the ontology hierarchy for that name.
This prevents the problem that occurred when most of the CS's from the dbpsb
dataset got their names from their type values. These type values are also
ontology classes, but we did not build any hierarchy information for these CS's.
Be stricter with the merging rule that uses the common ancestor, setting the
importance threshold (IMPORTANCE_THRESHOLD) to 0.001 (previously 0.01).
diffs (226 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2186,6 +2186,7 @@ void getTableName(CSlabel* label, int cs
maxFreq = typeAttributesHistogram[csIdx][i][0].freq;
ontClassPos = BUNfnd(BATmirror(ontmetaBat), &maxDepthOid);
if ( ontClassPos != BUN_NONE){
+ foundOntologyTypeValue = 1;
maxDepth = ontclassSet[ontClassPos].hierDepth;
}
else{
@@ -2332,7 +2333,6 @@ void getTableName(CSlabel* label, int cs
if (choosenOntologyTypeValue == BUN_NONE && resultCount[csIdx] >= 1){
label->name = result[csIdx][bestOntCandIdx];
- label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
nameFound = 1;
#if INFO_WHERE_NAME_FROM
label->isOntology = 1;
@@ -2369,6 +2369,16 @@ void getTableName(CSlabel* label, int cs
}
}
+
+ //Add hierarchy information for ontology-based name
+ if (nameFound){
+ ontClassPos = BUNfnd(BATmirror(ontmetaBat), &(label->name));
+ if ( ontClassPos != BUN_NONE){
+ label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
+ }
+ }
+
+
//if no name is found, check again the typecount to assign a name
#if USE_BEST_TYPEVALUE_INSTEADOF_DUMMY
if (!nameFound){
@@ -2610,11 +2620,10 @@ void printTree(OntoUsageNode* tree, int
}
static
-void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, oid**
ontmetadata, int ontmetadataCount, oid** result, int* resultCount, int
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount) {
- int i, j, k, l;
- oid *tmpList;
- int tmpListCount;
+void createOntoUsageTree(OntoUsageNode** tree, CSset* freqCSset, oid**
ontmetadata, int ontmetadataCount, BAT *ontmetaBat,CSlabel* labels) {
+ int i;
int numTuples = 0;
+ BUN pos;
// init tree with an artifical root node
(*tree) = (OntoUsageNode *) malloc(sizeof(OntoUsageNode));
@@ -2633,46 +2642,12 @@ void createOntoUsageTree(OntoUsageNode**
int hierarchyCount = 0;
oid* hierarchy;
- // get ontology
- // copied from getTableName
- if (resultCount[i] == 0) {
- // no hierarchy --> ignore
- continue;
- } else if (resultCount[i] == 1) {
- // one ontology class --> use it
- uri = result[i][0];
- } else {
- // multiple ontology classes --> intersect with types
- tmpList = NULL;
- tmpListCount = 0;
- // search for type values
- for (l = 0; l < typeAttributesCount; ++l) {
- for (j = 0; j <
typeAttributesHistogramCount[i][l]; ++j) {
- if
(typeAttributesHistogram[i][l][j].percent < TYPE_FREQ_THRESHOLD) break; //
sorted
- // intersect type with ontology classes
- for (k = 0; k < resultCount[i]; ++k) {
- if (result[i][k] ==
typeAttributesHistogram[i][l][j].value) {
- // found, copy ontology
class to tmpList
- tmpList = (oid *)
realloc(tmpList, sizeof(oid) * (tmpListCount + 1));
- if (!tmpList)
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
- tmpList[tmpListCount] =
result[i][k];
- tmpListCount += 1;
- }
- }
- }
- }
- if (tmpListCount == 1) {
- // only one left --> use it
- uri = tmpList[0];
- } else if (tmpListCount > 1) {
- // multiple left --> use the class that covers
most attributes, most popular ontology, ...
- uri = tmpList[0]; // sorted
- } else {
- // empty intersection -> use the class that
covers most attributes, most popular ontology, ..
- uri = result[i][0]; // sorted
- }
- free(tmpList);
- }
+ uri = labels[i].name;
+ if (uri == BUN_NONE) continue; //No name freqCS
+
+ //Check if the name is ontology name
+ pos = BUNfnd(BATmirror(ontmetaBat), &uri);
+ if (pos == BUN_NONE) continue; // no ontology information,
ignore
// get ontology hierarchy
hierarchy = getOntoHierarchy(uri, &hierarchyCount, ontmetadata,
ontmetadataCount);
@@ -2872,7 +2847,7 @@ CSlabel* createLabels(CSset* freqCSset,
#endif
// Collect ontology statistics (tree)
- createOntoUsageTree(ontoUsageTree, freqCSset, ontmetadata,
ontmetadataCount, ontologyLookupResult, ontologyLookupResultCount,
typeAttributesCount, typeAttributesHistogram, typeAttributesHistogramCount);
+ createOntoUsageTree(ontoUsageTree, freqCSset, ontmetadata,
ontmetadataCount, ontmetaBat, labels);
free(ontologyLookupResultCount);
freeOntologyLookupResult(ontologyLookupResult,
ontologyLookupResutMatchedProp, freqCSset->numCSadded);
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -3868,6 +3868,7 @@ char isSemanticSimilar(int freqId1, int
int hCount1, hCount2;
int level;
OntoUsageNode *tmpNode;
+
/*
int k1, k2;
if (labels[freqId1].name == labels[freqId2].name)
@@ -3887,7 +3888,7 @@ char isSemanticSimilar(int freqId1, int
}
}
*/
-
+
// Check for the most common ancestor
hCount1 = labels[freqId1].hierarchyCount;
hCount2 = labels[freqId2].hierarchyCount;
@@ -3899,17 +3900,17 @@ char isSemanticSimilar(int freqId1, int
printf("Finding common ancestor for %d and %d \n", freqId1, freqId2 );
printf("FreqCS1: ");
for (i = 0; i < hCount1; i++){
- printf(" %s", labels[freqId1].hierarchy[hCount1-1-i]);
+ printf(" " BUNFMT, labels[freqId1].hierarchy[hCount1-1-i]);
}
printf(" \n ");
printf("FreqCS2: ");
for (i = 0; i < hCount2; i++){
- printf(" %s", labels[freqId2].hierarchy[hCount2-1-i]);
+ printf(" " BUNFMT, labels[freqId2].hierarchy[hCount2-1-i]);
}
printf(" \n ");
}
-
*/
+
if (0){
if ((freqId1 > numOrigFreqCS -1) || (freqId2 > numOrigFreqCS -1))
@@ -3951,8 +3952,10 @@ char isSemanticSimilar(int freqId1, int
oid classOid;
BUN ontClassPos;
classOid = tmpNode->uri;
+
ontClassPos = BUNfnd(BATmirror(ontmetaBat), &classOid);
assert(ontClassPos != BUN_NONE);
+
/*
if (ontClassPos != BUN_NONE){
printf(" Specific level: %d \n",
ontclassSet[ontClassPos].hierDepth);
@@ -4031,8 +4034,6 @@ void mergeCSByS2S4(CSset *freqCSset, CSl
oid name; /* Name of the common ancestor */
TFIDFInfo *tfidfInfos;
-
-
(void) labels;
(void) isLabelComparable;
@@ -4193,12 +4194,14 @@ void mergeCSByS2(CSset *freqCSset, CSlab
#if NOT_MERGE_DIMENSIONCS
if (freqCSset->items[freqId1].type == DIMENSIONCS) continue;
#endif
+
+ if ((*labels)[freqId1].hierarchyCount < 1) continue;
+
for (j = (i+1); j < curNumMergeCS; j++){
freqId2 = mergeCSFreqCSMap[j];
#if NOT_MERGE_DIMENSIONCS
if (freqCSset->items[freqId2].type == DIMENSIONCS)
continue;
#endif
-
if (isLabelComparable == 1 &&
isSemanticSimilar(freqId1, freqId2, (*labels),
ontoUsageTree,freqCSset->numOrigFreqCS, &name, ontmetaBat, ontclassSet) == 1){
//printf("Same labels between freqCS %d and
freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore);
doMerge(freqCSset, S2, freqId1, freqId2,
mergecsId, labels, ontmetadata, ontmetadataCount, name);
@@ -4272,6 +4275,7 @@ void mergeCSByS4(CSset *freqCSset, CSlab
#else
if (simscore > SIM_THRESHOLD) {
#endif
+ //printf(" Similarity score (%d and %d)
cosine = %f \n", freqId1,freqId2,simscore);
/*
if ((*labels)[freqId1].name != BUN_NONE){
takeOid((*labels)[freqId1].name,
&freqCSname1);
@@ -7526,7 +7530,6 @@ RDFextractCSwithTypes(int *ret, bat *sba
curNumMergeCS = countNumberMergeCS(freqCSset);
printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS);
-
/* ---------- S1 ------- */
mergecsId = *maxCSoid + 1;
@@ -7572,7 +7575,6 @@ RDFextractCSwithTypes(int *ret, bat *sba
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
-
/* S5: Merged CS referred from the same CS via the same property */
if (1){
tmpCSrelToMergeCS = generateCsRelToMergeFreqSet(csrelSet, freqCSset);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -238,7 +238,7 @@ typedef struct SubCSSet{
#define SIM_THRESHOLD 0.6
//#define SIM_TFIDF_THRESHOLD 0.55
#define SIM_TFIDF_THRESHOLD 0.75
-#define IMPORTANCE_THRESHOLD 0.01 //This is used when merging CS's by common
ancestor
+#define IMPORTANCE_THRESHOLD 0.001 //This is used when merging CS's by common
ancestor
#define COMMON_ANCESTOR_LOWEST_SPECIFIC_LEVEL 2
#define MIN_PERCETAGE_S5 5 // Merge all CS refered by more than
1/MIN_PERCETAGE_S6 percent of a CS via one property
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list