Changeset: f228354ea441 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f228354ea441
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Use the hierarchy information when merging CS's.
Find the common ancestor of two CS's using the hierarchy information.
diffs (128 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2665,10 +2665,15 @@ void mergeMaxFreqCSByS6(CSrel *csrelBetw
}
static
-char isSemanticSimilar(int freqId1, int freqId2, CSlabel* labels){ /*Rule
S1 S2 S3*/
+char isSemanticSimilar(int freqId1, int freqId2, CSlabel* labels,
OntoUsageNode *tree){ /*Rule S1 S2 S3*/
int i, j;
int k1, k2;
-
+ //int commonHierarchy = -1;
+ int minCount = 0;
+ int hCount1, hCount2;
+ int level;
+ OntoUsageNode *tmpNode;
+
if (strcmp(labels[freqId1].name, labels[freqId2].name) == 0)
return 1;
else{ /* Check top k candidates */
@@ -2677,16 +2682,66 @@ char isSemanticSimilar(int freqId1, int
for (i = 0; i < k1; i++){
for (j = 0; j < k2; j++){
- if (strcmp(labels[freqId1].candidates[i],
labels[freqId2].candidates[j]) == 0) return 1;
+ if (strcmp(labels[freqId1].candidates[i],
labels[freqId2].candidates[j]) == 0)
+ {
+ //printf("FreqCS %d and %d shares top-k
candidates i = %d, j = %d: %s \n", freqId1, freqId2, i, j,
labels[freqId2].candidates[j]);
+ return 1;
+ }
}
}
}
+ // Check for the most common ancestor
+ hCount1 = labels[freqId1].hierarchyCount;
+ hCount2 = labels[freqId2].hierarchyCount;
+ minCount = (hCount1 > hCount2)?hCount2:hCount1;
+
+ /*
+ printf("minCount = %d \n", minCount);
+ printf("Finding common ancestor for %d and %d \n", freqId1, freqId2 );
+ printf("FreqCS1: ");
+ for (i = 0; i < hCount1; i++){
+ printf(" %s", labels[freqId1].hierarchy[hCount1-1-i]);
+ }
+ printf(" \n ");
+ printf("FreqCS2: ");
+ for (i = 0; i < hCount2; i++){
+ printf(" %s", labels[freqId2].hierarchy[hCount2-1-i]);
+ }
+ printf(" \n ");
+ */
+
+ for (i = 0; i < minCount; i++){
+ if (strcmp(labels[freqId1].hierarchy[hCount1-1-i],
labels[freqId2].hierarchy[hCount2-1-i]) != 0)
+ break;
+ }
+ //printf("The common ancestor of freqCS %d and %d is at %d \n",freqId1,
freqId2,i);
+ if (i !=0 && i != minCount){ /*There is a common ancestor at i */
+ level = 1;
+ tmpNode = tree;
+ while(level < i){
+ for (j = 0; j < tmpNode->numChildren; j++) {
+ if (strcmp(tmpNode->lstChildren[j]->uri,
labels[freqId1].hierarchy[hCount1-1-level]) == 0){
+ tmpNode = tmpNode->lstChildren[j];
+ break;
+ }
+ }
+ level++;
+ }
+ //printf("The common ancestor of freqCS %d and %d is: %s ---
Importance score: %f \n", freqId1, freqId2, tmpNode->uri, tmpNode->percentage);
+ if (tmpNode->percentage < 0.4) {
+ //printf("Merge two CS's using the common ancestor \n");
+ return 1;
+ }
+
+ }
+
+
return 0;
}
static
-void mergeMaximumFreqCSsAll(CSset *freqCSset, CSlabel* labels, oid*
superCSFreqCSMap, int numMaxCSs, oid *mergecsId){
+void mergeMaximumFreqCSsAll(CSset *freqCSset, CSlabel* labels, oid*
superCSFreqCSMap, int numMaxCSs, oid *mergecsId,OntoUsageNode *ontoUsageTree){
int i, j, k;
int freqId1, freqId2;
float simscore = 0.0;
@@ -2734,7 +2789,7 @@ void mergeMaximumFreqCSsAll(CSset *freqC
isSameLabel = 0;
#if USE_LABEL_FOR_MERGING
- if (isLabelComparable == 1 &&
isSemanticSimilar(freqId1, freqId2, labels) == 1){
+ if (isLabelComparable == 1 &&
isSemanticSimilar(freqId1, freqId2, labels, ontoUsageTree) == 1){
//printf("Same labels between freqCS %d and
freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore);
isSameLabel = 1;
simscore = 1;
@@ -3883,7 +3938,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
printf("Using ontologies with %d ontattributesCount and %d
ontmetadataCount \n",ontattributesCount,ontmetadataCount);
(*labels) = createLabels(freqCSset, csrelSet, freqCSset->numCSadded,
sbat, si, pi, oi, *subjCSMap, mbat, csIdFreqIdxMap, ontattributes,
ontattributesCount, ontmetadata, ontmetadataCount, &ontoUsageTree);
-
+
curT = clock();
printf("Done labeling!!! Took %f seconds.\n", ((float)(curT -
tmpLastT))/CLOCKS_PER_SEC);
tmpLastT = curT;
@@ -3929,7 +3984,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
tmpLastT = curT;
/* S1, S2, S3, S5 */
- mergeMaximumFreqCSsAll(freqCSset, *labels, superCSFreqCSMap, numMaxCSs,
&mergecsId);
+ mergeMaximumFreqCSsAll(freqCSset, *labels, superCSFreqCSMap, numMaxCSs,
&mergecsId, ontoUsageTree);
curT = clock();
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -166,6 +166,7 @@ typedef struct SubCSSet{
#define INIT_NUM_CS 100
#define SIM_THRESHOLD 0.6
#define SIM_TFIDF_THRESHOLD 0.55
+#define IMPORTANCE_THRESHOLD 0.4
typedef struct CSset{
CS* items;
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list