Changeset: a0bcec66e6b1 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=a0bcec66e6b1
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Improve S4: Merging CS's if they have at least one discriminating prop in
common.
diffs (238 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2718,7 +2718,7 @@ float similarityScore(oid* arr1, oid* ar
/*Using cosine similarity score with vector of tf-idfs for properties in each
CS */
static
float similarityScoreTFIDF(oid* arr1, oid* arr2, int m, int n, int
*numCombineP,
- TFIDFInfo *tfidfInfos, int mergeCSId1, int mergeCSId2){
+ TFIDFInfo *tfidfInfos, int mergeCSId1, int mergeCSId2, char
*existDiscriminatingProp){
int i = 0, j = 0;
int numOverlap = 0;
@@ -2734,6 +2734,8 @@ float similarityScoreTFIDF(oid* arr1, oi
}
else if( arr1[j] == arr2[i] )
{
+ if (tfidfInfos[mergeCSId1].lsttfidfs[j] >
MIN_TFIDF_PROP_S4) *existDiscriminatingProp = 1;
+
sumXY += tfidfInfos[mergeCSId1].lsttfidfs[j] *
tfidfInfos[mergeCSId1].lsttfidfs[j];
j++;
i++;
@@ -4016,160 +4018,6 @@ void freeTFIDFInfo(TFIDFInfo *tfidfInfos
free(tfidfInfos);
}
-#if COMBINE_S2_S4
-static
-void mergeCSByS2S4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap,
int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid
**ontmetadata, int ontmetadataCount){
- int i, j, k;
- int freqId1, freqId2;
- float simscore = 0.0;
- CS *mergecs;
- int existMergecsId;
- int numCombineP = 0;
- CS *cs1, *cs2;
- CS *existmergecs, *mergecs1, *mergecs2;
-
- PropStat *propStat; /* Store statistics about properties */
- char isLabelComparable = 0;
- char isSameLabel = 0;
- oid name; /* Name of the common ancestor */
- TFIDFInfo *tfidfInfos;
-
- (void) labels;
- (void) isLabelComparable;
-
-
- propStat = initPropStat();
- getPropStatisticsFromMergeCSs(propStat, curNumMergeCS,
mergeCSFreqCSMap, freqCSset); /*TODO: Get PropStat from MaxCSs or From mergedCS
only*/
- tfidfInfos = (TFIDFInfo*)malloc(sizeof(TFIDFInfo) * curNumMergeCS);
- initTFIDFInfos(tfidfInfos, curNumMergeCS, mergeCSFreqCSMap, freqCSset,
propStat);
-
-
- for (i = 0; i < curNumMergeCS; i++){
- freqId1 = mergeCSFreqCSMap[i];
- //printf("Label of %d CS is %s \n", freqId1,
(*labels)[freqId1].name);
- isLabelComparable = 0;
- if ((*labels)[freqId1].name != BUN_NONE) isLabelComparable = 1;
// no "DUMMY"
-
-
- #if NOT_MERGE_DIMENSIONCS
- if (freqCSset->items[freqId1].type == DIMENSIONCS) continue;
- #endif
- for (j = (i+1); j < curNumMergeCS; j++){
- cs1 = (CS*) &(freqCSset->items[freqId1]);
-
- freqId2 = mergeCSFreqCSMap[j];
- cs2 = (CS*) &(freqCSset->items[freqId2]);
- #if NOT_MERGE_DIMENSIONCS
- if (cs2->type == DIMENSIONCS) continue;
- #endif
- isSameLabel = 0;
-
- #if USE_LABEL_FOR_MERGING
- if (isLabelComparable == 1 &&
isSemanticSimilar(freqId1, freqId2, (*labels),
ontoUsageTree,freqCSset->numOrigFreqCS, &name) == 1){
- //printf("Same labels between freqCS %d and
freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore);
- isSameLabel = 1;
- simscore = 1;
- }
- #endif
-
- if (isSameLabel == 0){
- if(USINGTFIDF == 0){
- simscore =
similarityScore(cs1->lstProp, cs2->lstProp,
-
cs1->numProp,cs2->numProp,&numCombineP);
-
- //printf("simscore Jaccard = %f \n",
simscore);
- }
- else{
- simscore =
similarityScoreTFIDF(cs1->lstProp, cs2->lstProp,
-
cs1->numProp,cs2->numProp,&numCombineP, tfidfInfos, i, j);
- //printf(" Cosine = %f \n",
simscore);
-
- }
- }
-
- //simscore = 0.0;
- #if USINGTFIDF
- if (simscore > SIM_TFIDF_THRESHOLD){
- #else
- if (simscore > SIM_THRESHOLD) {
- #endif
- //printf("S4: merge freqCS %d and freqCS %d
(sim: %f)\n", freqId1, freqId2,simscore);
- //Check whether these CS's belong to any mergeCS
- if (cs1->parentFreqIdx == -1 &&
cs2->parentFreqIdx == -1){ /* New merge */
- mergecs = mergeTwoCSs(*cs1,*cs2,
freqId1,freqId2, *mergecsId);
- //addmergeCStoSet(mergecsSet, *mergecs);
- cs1->parentFreqIdx =
freqCSset->numCSadded;
- cs2->parentFreqIdx =
freqCSset->numCSadded;
- addCStoSet(freqCSset,*mergecs);
- if (isSameLabel) {
- // rule S2
- updateLabel(S2, freqCSset,
labels, 1, freqCSset->numCSadded - 1, freqId1, freqId2, name, ontmetadata,
ontmetadataCount, NULL, -1);
- } else {
- // rule S4
- updateLabel(S4, freqCSset,
labels, 1, freqCSset->numCSadded - 1, freqId1, freqId2, BUN_NONE, ontmetadata,
ontmetadataCount, NULL, -1);
- }
- free(mergecs);
-
- mergecsId[0]++;
-
-
- }
- else if (cs1->parentFreqIdx == -1 &&
cs2->parentFreqIdx != -1){
- existMergecsId = cs2->parentFreqIdx;
- existmergecs = (CS*)
&(freqCSset->items[existMergecsId]);
- mergeACStoExistingmergeCS(*cs1,freqId1,
existmergecs);
- cs1->parentFreqIdx = existMergecsId;
- if (isSameLabel) {
- // rule S2
- updateLabel(S2, freqCSset,
labels, 0, existMergecsId, freqId1, freqId2, name, ontmetadata,
ontmetadataCount, NULL, -1);
- } else {
- // rule S4
- updateLabel(S4, freqCSset,
labels, 0, existMergecsId, freqId1, freqId2, BUN_NONE, ontmetadata,
ontmetadataCount, NULL, -1);
- }
- }
-
- else if (cs1->parentFreqIdx != -1 &&
cs2->parentFreqIdx == -1){
- existMergecsId = cs1->parentFreqIdx;
- existmergecs =
(CS*)&(freqCSset->items[existMergecsId]);
- mergeACStoExistingmergeCS(*cs2,freqId2,
existmergecs);
- cs2->parentFreqIdx = existMergecsId;
- if (isSameLabel) {
- // rule S2
- updateLabel(S2, freqCSset,
labels, 0, existMergecsId, freqId1, freqId2, name, ontmetadata,
ontmetadataCount, NULL, -1);
- } else {
- // rule S4
- updateLabel(S4, freqCSset,
labels, 0, existMergecsId, freqId1, freqId2, BUN_NONE, ontmetadata,
ontmetadataCount, NULL, -1);
- }
- }
- else if (cs1->parentFreqIdx !=
cs2->parentFreqIdx){
- mergecs1 =
(CS*)&(freqCSset->items[cs1->parentFreqIdx]);
- mergecs2 =
(CS*)&(freqCSset->items[cs2->parentFreqIdx]);
-
- mergeTwomergeCS(mergecs1, mergecs2,
cs1->parentFreqIdx);
-
- //Re-map for all maxCS in mergecs2
- for (k = 0; k <
mergecs2->numConsistsOf; k++){
-
freqCSset->items[mergecs2->lstConsistsOf[k]].parentFreqIdx = cs1->parentFreqIdx;
- }
- if (isSameLabel) {
- // rule S2
- updateLabel(S2, freqCSset,
labels, 0, cs1->parentFreqIdx, freqId1, freqId2, name, ontmetadata,
ontmetadataCount, NULL, -1);
- } else {
- // rule S4
- updateLabel(S4, freqCSset,
labels, 0, cs1->parentFreqIdx, freqId1, freqId2, BUN_NONE, ontmetadata,
ontmetadataCount, NULL, -1);
- }
- }
- }
- }
- }
-
-
- freePropStat(propStat);
- freeTFIDFInfo(tfidfInfos, curNumMergeCS);
-
-}
-#endif //COMBINE_S2_S4
-
static
void mergeCSByS2(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap,
int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid
**ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet){
int i, j;
@@ -4223,6 +4071,8 @@ void mergeCSByS4(CSset *freqCSset, CSlab
PropStat *propStat; /* Store statistics about properties */
TFIDFInfo *tfidfInfos;
+ char existDiscriminatingProp = 0;
+
/*
int ret;
char* schema = "rdf";
@@ -4255,6 +4105,8 @@ void mergeCSByS4(CSset *freqCSset, CSlab
#endif
if (cs1->parentFreqIdx != -1 && cs1->parentFreqIdx ==
cs2->parentFreqIdx) continue; //They have already been merged
+
+ existDiscriminatingProp = 0;
if(USINGTFIDF == 0){
simscore = similarityScore(cs1->lstProp,
cs2->lstProp,
@@ -4264,20 +4116,19 @@ void mergeCSByS4(CSset *freqCSset, CSlab
}
else{
simscore = similarityScoreTFIDF(cs1->lstProp,
cs2->lstProp,
- cs1->numProp,cs2->numProp,&numCombineP,
tfidfInfos, i, j);
+ cs1->numProp,cs2->numProp,&numCombineP,
tfidfInfos, i, j, &existDiscriminatingProp);
//printf(" Cosine = %f \n", simscore);
}
//simscore = 0.0;
#if USINGTFIDF
- if (simscore > SIM_TFIDF_THRESHOLD){
+ if (simscore > SIM_TFIDF_THRESHOLD &&
existDiscriminatingProp){
#else
if (simscore > SIM_THRESHOLD) {
#endif
- //printf(" Similarity score (%d and %d)
cosine = %f \n", freqId1,freqId2,simscore);
- /*
- if ((*labels)[freqId1].name != BUN_NONE){
+ /*
+ if ((*labels)[freqId1].name !=
BUN_NONE){
takeOid((*labels)[freqId1].name,
&freqCSname1);
printf("Merge %d (%s) and ",freqId1,
freqCSname1);
GDKfree(freqCSname1);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -248,6 +248,9 @@ typedef struct SubCSSet{
// Number of references > (Frequency of
referredCS / MIN_TO_PERCETAGE_S5)
#define MIN_TFIDF_PROP_S5 3 // The prop for FK in S5 must not be a common
prop, it should be a discriminating one
// This is for preventing the case of webpageID
link in dbpedia
+#define MIN_TFIDF_PROP_S4 3.5 // When we merge two CS's based on the
tf-idf/consine similarity score, we want
+ // to make sure that we do not merge two CS's
that may have same set of really common properties
+ // such as type, description. They should have
at least one discriminating prop in common.
//#define MIN_FROMTABLE_SIZE_S5 1 /* For example data */
#define MINIMUM_TABLE_SIZE 10000 //The minimum number of triples coverred by
a table (i.e., a final CS)
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list