Changeset: 953fd23b3505 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=953fd23b3505
Modified Files:
monetdb5/extras/rdf/rdf_shredder.c
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Use labels for finding maxCS and fix problems with using labels in the merging process.
diffs (151 lines):
diff --git a/monetdb5/extras/rdf/rdf_shredder.c b/monetdb5/extras/rdf/rdf_shredder.c
--- a/monetdb5/extras/rdf/rdf_shredder.c
+++ b/monetdb5/extras/rdf/rdf_shredder.c
@@ -359,7 +359,7 @@ tripleHandler(void* user_data, const rap
} else if (triple->object->type == RAPTOR_TERM_TYPE_LITERAL) {
unsigned char* objStr;
- ObjectType objType;
+ ObjectType objType = STRING;
objStr = raptor_term_to_string(triple->object);
objType = getObjectType(objStr, &realNumValue);
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -1853,19 +1853,31 @@ void getMaximumFreqCSs(CSset *freqCSset,
int tmpParentIdx;
int* coverage;
int* freq;
+ char isLabelComparable = 0;
(void) labels;
+ (void) isLabelComparable;
printf("Retrieving maximum frequent CSs: \n");
for (i = 0; i < numFreqCS; i++){
if (freqCSset->items[i].parentFreqIdx != -1) continue;
+ isLabelComparable = 0;
+ if (strcmp(labels[i].name, "DUMMY") != 0) isLabelComparable = 1;
+
for (j = (i+1); j < numFreqCS; j++){
if (freqCSset->items[j].numProp >
freqCSset->items[i].numProp){
if (isSubset(freqCSset->items[j].lstProp,
freqCSset->items[i].lstProp,
freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) {
/* CSj is a superset of CSi */
+ #if USE_LABEL_FINDING_MAXCS
+ if (isLabelComparable == 1 &&
strcmp(labels[i].name, labels[j].name) == 0) {
+
freqCSset->items[i].parentFreqIdx = j;
+ break;
+ }
+ #else
freqCSset->items[i].parentFreqIdx = j;
+ #endif
break;
}
}
@@ -1873,7 +1885,13 @@ void getMaximumFreqCSs(CSset *freqCSset,
if (isSubset(freqCSset->items[i].lstProp,
freqCSset->items[j].lstProp,
freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) {
/* CSj is a subset of CSi */
+ #if USE_LABEL_FINDING_MAXCS
+ if (isLabelComparable == 1 &&
strcmp(labels[i].name, labels[j].name) == 0) {
+
freqCSset->items[j].parentFreqIdx = i;
+ }
+ #else
freqCSset->items[j].parentFreqIdx = i;
+ #endif
}
}
@@ -2227,8 +2245,11 @@ void mergeMaximumFreqCSsAll(CSset *freqC
PropStat *propStat; /* Store statistics about properties */
int nummergedCSs = 0;
+ char isLabelComparable = 0;
+ char isSameLabel = 0;
(void) labels;
+ (void) isLabelComparable;
for (i = 0; i < freqCSset->numCSadded; i++){
if (freqCSset->items[i].parentFreqIdx == -1){
@@ -2248,30 +2269,38 @@ void mergeMaximumFreqCSsAll(CSset *freqC
for (i = 0; i < numMaxCSs; i++){
freqId1 = superCSFreqCSMap[i];
+ //printf("Label of %d CS is %s \n", freqId1,
labels[freqId1].name);
+ isLabelComparable = 0;
+ if (strcmp(labels[freqId1].name,"DUMMY") != 0)
isLabelComparable = 1;
+
cs1 = (CS*) &(freqCSset->items[freqId1]);
for (j = (i+1); j < numMaxCSs; j++){
freqId2 = superCSFreqCSMap[j];
cs2 = (CS*) &(freqCSset->items[freqId2]);
-
- if(USINGTFIDF == 0){
- simscore = similarityScore(cs1->lstProp,
cs2->lstProp,
- cs1->numProp,cs2->numProp,&numCombineP);
-
- //printf("simscore Jaccard = %f \n", simscore);
- }
- else{
- simscore = similarityScoreTFIDF(cs1->lstProp,
cs2->lstProp,
- cs1->numProp,cs2->numProp,&numCombineP,
propStat);
- //printf(" Cosine = %f \n", simscore);
-
- }
-
+ isSameLabel = 0;
+
#if USE_LABEL_FOR_MERGING
- if (strcmp(labels[freqId1].name, labels[freqId2].name)
== 0){
- //printf("Same labels between freqCS %d and
freqCS %d \n", freqId1, freqId2);
+ if (isLabelComparable == 1 &&
strcmp(labels[freqId1].name, labels[freqId2].name) == 0){
+ //printf("Same labels between freqCS %d and
freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore);
+ isSameLabel = 1;
simscore = 1;
}
#endif
+
+ if (isSameLabel == 0){
+ if(USINGTFIDF == 0){
+ simscore =
similarityScore(cs1->lstProp, cs2->lstProp,
+
cs1->numProp,cs2->numProp,&numCombineP);
+
+ //printf("simscore Jaccard = %f \n",
simscore);
+ }
+ else{
+ simscore =
similarityScoreTFIDF(cs1->lstProp, cs2->lstProp,
+
cs1->numProp,cs2->numProp,&numCombineP, propStat);
+ //printf(" Cosine = %f \n",
simscore);
+
+ }
+ }
//simscore = 0.0;
#if USINGTFIDF
@@ -3163,6 +3192,8 @@ RDFextractCSwithTypes(int *ret, bat *sba
// Create label per freqCS
csIdFreqIdxMap = (int *) malloc (sizeof(int) * (*maxCSoid + 1));
initcsIdFreqIdxMap(csIdFreqIdxMap, *maxCSoid + 1, -1, freqCSset);
+ printf("Using ontologies with %d ontattributesCount and %d
ontmetadataCount \n",ontattributesCount,ontmetadataCount);
+
labels = createLabels(freqCSset, csrelSet, *maxCSoid + 1, sbat, si, pi,
oi, *subjCSMap, mbat, csIdFreqIdxMap, *freqThreshold, ontattributes,
ontattributesCount, ontmetadata, ontmetadataCount);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -88,7 +88,9 @@ typedef struct PropStat {
#define FULL_PROP_STAT 1 // Only use for showing the statistic on all
properties / all CSs. (Default should be 0)
-#define USE_LABEL_FOR_MERGING 1 // Use the labels received from
labeling process for finding maxCS and mergeCS
+
+#define USE_LABEL_FINDING_MAXCS 1 // Use the labels received from
labeling process for finding maxCS
+#define USE_LABEL_FOR_MERGING 1 // Use the labels received from
labeling process for finding mergeCS
typedef struct CS
{
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list