Changeset: 32687ef13872 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=32687ef13872
Modified Files:
monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Lower the importance-score threshold used when merging CS's (IMPORTANCE_THRESHOLD: 0.4 -> 0.01)
diffs (143 lines):
diff --git a/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
b/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
--- a/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
+++ b/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
@@ -5,7 +5,7 @@ cp loadOntologySAMPLE.sql loadtmp.sql
sed -i "s:NUMMETADATA:$NUMMETADATA:g" loadtmp.sql
sed -i "s:NUMATTRIBUTES:$NUMATTRIBUTES:g" loadtmp.sql
sed -i "s:MetaFile:${PWD}/ontMetadata.dbpedia.csv:g" loadtmp.sql
-sed -i "s:AttFile:${PWD}/ontMetadata.dbpedia.csv:g" loadtmp.sql
+sed -i "s:AttFile:${PWD}/ontAttribute.dbpedia.csv:g" loadtmp.sql
@@ -21,7 +21,7 @@ cp loadOntologySAMPLE.sql loadtmp.sql
sed -i "s:NUMMETADATA:$NUMMETADATA:g" loadtmp.sql
sed -i "s:NUMATTRIBUTES:$NUMATTRIBUTES:g" loadtmp.sql
sed -i "s:MetaFile:${PWD}/ontMetadata.gr.csv:g" loadtmp.sql
-sed -i "s:AttFile:${PWD}/ontMetadata.gr.csv:g" loadtmp.sql
+sed -i "s:AttFile:${PWD}/ontAttribute.gr.csv:g" loadtmp.sql
mclient < loadtmp.sql
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2170,8 +2170,10 @@ void createOntoUsageTree(OntoUsageNode**
createOntoUsageTreeStatistics(*tree, numTuples);
// print
+ if(0){
printf("Ontology tree:\n");
printTree(*tree, 0);
+ }
}
static
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -1329,7 +1329,7 @@ void mergeTwomergeCS(CS *mergecs1, CS *m
}
static
-str printFreqCSSet(CSset *freqCSset, BAT *freqBat, BAT *mapbat, char
isWriteTofile, int freqThreshold){
+str printFreqCSSet(CSset *freqCSset, BAT *freqBat, BAT *mapbat, char
isWriteTofile, int freqThreshold, CSlabel* labels){
int i;
int j;
@@ -1394,7 +1394,7 @@ str printFreqCSSet(CSset *freqCSset, BAT
if (cs.subject != BUN_NONE){
takeOid(cs.subject, &subStr);
- fprintf(fout,"CS " BUNFMT " (Freq: %d) |
Subject: %s | FreqParentIdx %d \n", cs.csId, *freq, subStr, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " - FreqId %d - Name:
%s (Freq: %d) | Subject: %s | FreqParentIdx %d \n", cs.csId, i,
labels[i].name, *freq, subStr, cs.parentFreqIdx);
// Filter max freq cs set
if (cs.type == MAXCS){
@@ -2674,6 +2674,7 @@ char isSemanticSimilar(int freqId1, int
int level;
OntoUsageNode *tmpNode;
+ //if(0){
if (strcmp(labels[freqId1].name, labels[freqId2].name) == 0)
return 1;
else{ /* Check top k candidates */
@@ -2690,13 +2691,14 @@ char isSemanticSimilar(int freqId1, int
}
}
}
-
+ //}
// Check for the most common ancestor
hCount1 = labels[freqId1].hierarchyCount;
hCount2 = labels[freqId2].hierarchyCount;
minCount = (hCount1 > hCount2)?hCount2:hCount1;
/*
+ if (minCount > 0){
printf("minCount = %d \n", minCount);
printf("Finding common ancestor for %d and %d \n", freqId1, freqId2 );
printf("FreqCS1: ");
@@ -2709,15 +2711,18 @@ char isSemanticSimilar(int freqId1, int
printf(" %s", labels[freqId2].hierarchy[hCount2-1-i]);
}
printf(" \n ");
+ }
+
*/
for (i = 0; i < minCount; i++){
if (strcmp(labels[freqId1].hierarchy[hCount1-1-i],
labels[freqId2].hierarchy[hCount2-1-i]) != 0)
break;
}
- //printf("The common ancestor of freqCS %d and %d is at %d \n",freqId1,
freqId2,i);
+
+ //printf("The common ancestor of freqCS %d and %d is at %d (minCount =
%d) \n",freqId1, freqId2,i, minCount);
if (i !=0 && i != minCount){ /*There is a common ancestor at i */
- level = 1;
+ level = 0;
tmpNode = tree;
while(level < i){
for (j = 0; j < tmpNode->numChildren; j++) {
@@ -2728,9 +2733,9 @@ char isSemanticSimilar(int freqId1, int
}
level++;
}
- //printf("The common ancestor of freqCS %d and %d is: %s ---
Importance score: %f \n", freqId1, freqId2, tmpNode->uri, tmpNode->percentage);
- if (tmpNode->percentage < 0.4) {
- //printf("Merge two CS's using the common ancestor \n");
+ //printf("The common ancestor of freqCS %d (%s) and freqCS %d
(%s) is: %s --- %f \n", freqId1, labels[freqId1].name, freqId2,
labels[freqId2].name, tmpNode->uri, tmpNode->percentage);
+ if (tmpNode->percentage < IMPORTANCE_THRESHOLD) {
+ //printf("Merge two CS's %s and %s using the common
ancestor (%s) at level %d (score:
%f)\n",labels[freqId1].name,labels[freqId2].name,tmpNode->uri,
i,tmpNode->percentage);
return 1;
}
@@ -3943,6 +3948,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
printf("Done labeling!!! Took %f seconds.\n", ((float)(curT -
tmpLastT))/CLOCKS_PER_SEC);
tmpLastT = curT;
+
/*S4: Merge two CS's having the subset-superset relationship */
getMaximumFreqCSs(freqCSset, *labels, csBats->coverageBat,
csBats->freqBat, *maxCSoid + 1, &numMaxCSs);
@@ -3952,7 +3958,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
//printf("Number of maximumCS: %d", numMaxCSs);
- printFreqCSSet(freqCSset, csBats->freqBat, mbat, 1, *freqThreshold);
+ printFreqCSSet(freqCSset, csBats->freqBat, mbat, 1, *freqThreshold,
*labels);
curT = clock();
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -166,7 +166,7 @@ typedef struct SubCSSet{
#define INIT_NUM_CS 100
#define SIM_THRESHOLD 0.6
#define SIM_TFIDF_THRESHOLD 0.55
-#define IMPORTANCE_THRESHOLD 0.4
+#define IMPORTANCE_THRESHOLD 0.01
typedef struct CSset{
CS* items;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list