Changeset: 8b1a1fc8fcb1 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=8b1a1fc8fcb1
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Separate rules S2 and S4
diffs (truncated from 309 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2732,7 +2732,7 @@ void updateParentIdxAll(CSset *freqCSset
* Here maximum frequent CS is a CS that there exist no other CS which
contains that CS
* */
static
-void mergeCSbyS4(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap,
int curNumMergeCS, oid **ontmetadata, int ontmetadataCount){
+void mergeCSbyS3(CSset *freqCSset, CSlabel** labels, oid *mergeCSFreqCSMap,
int curNumMergeCS, oid **ontmetadata, int ontmetadataCount){
int numMergeCS = curNumMergeCS;
int i, j;
@@ -3177,7 +3177,7 @@ void generatecsRelSum(CSrel csRel, int f
for (i = 0; i < csRel.numRef; i++){
freq = freqCSset->items[csRel.origFreqIdx].support;
- if (freq > MIN_FROMTABLE_SIZE_S6 && freq < csRel.lstCnt[i] *
MIN_PERCETAGE_S6){
+ if (freq > MIN_FROMTABLE_SIZE_S5 && freq < csRel.lstCnt[i] *
MIN_PERCETAGE_S5){
propIdx = 0;
while (csRelSum->lstPropId[propIdx] !=
csRel.lstPropId[i])
propIdx++;
@@ -3605,7 +3605,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
}
static
-void mergeMaxFreqCSByS6(CSrel *csrelMergeFreqSet, CSset *freqCSset, CSlabel**
labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId, oid**
ontmetadata, int ontmetadataCount){
+void mergeMaxFreqCSByS5(CSrel *csrelMergeFreqSet, CSset *freqCSset, CSlabel**
labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId, oid**
ontmetadata, int ontmetadataCount){
int i;
int freqId;
//int relId;
@@ -3632,7 +3632,7 @@ void mergeMaxFreqCSByS6(CSrel *csrelMerg
//int numCombinedP = 0;
int startIdx = 0;
- printf("Start merging CS by using S6 \n");
+ printf("Start merging CS by using S5[From FK] \n");
#if NO_OUTPUTFILE == 0
strcpy(filename, "csRelSum.txt");
@@ -3885,8 +3885,9 @@ void freeTFIDFInfo(TFIDFInfo *tfidfInfos
free(tfidfInfos);
}
+#if COMBINE_S2_S4
static
-void mergeCSByS3S5(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap,
int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid
**ontmetadata, int ontmetadataCount){
+void mergeCSByS2S4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap,
int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid
**ontmetadata, int ontmetadataCount){
int i, j, k;
int freqId1, freqId2;
float simscore = 0.0;
@@ -4038,8 +4039,114 @@ void mergeCSByS3S5(CSset *freqCSset, CSl
freeTFIDFInfo(tfidfInfos, curNumMergeCS);
}
-
-
+#endif //COMBINE_S2_S4
+/*
+ * Rule S2: merge pairs of CS's that isSemanticSimilar() reports as similar.
+ *
+ * Visits every pair (i, j) with i < j < curNumMergeCS of entries in
+ * mergeCSFreqCSMap. A pair is handed to doMerge() with rule S2 only when
+ * the first CS of the pair has a label name other than BUN_NONE and
+ * isSemanticSimilar() returns 1 for the pair. The oid that
+ * isSemanticSimilar() stores in 'name' is forwarded to doMerge() as its
+ * last argument.
+ *
+ * When NOT_MERGE_DIMENSIONCS is enabled, any CS whose type is DIMENSIONCS
+ * is skipped on either side of the pair.
+ *
+ * Parameters:
+ *   freqCSset         set of CS's; freqCSset->items[] is indexed with the
+ *                     ids read from mergeCSFreqCSMap
+ *   labels            pointer to the label array, read as (*labels)[freqId]
+ *                     and also forwarded to doMerge()
+ *   mergeCSFreqCSMap  maps a position 0..curNumMergeCS-1 to a freqCS index
+ *   curNumMergeCS     number of entries of mergeCSFreqCSMap that are visited
+ *   mergecsId         forwarded unchanged to doMerge()
+ *   ontoUsageTree     forwarded unchanged to isSemanticSimilar()
+ *   ontmetadata, ontmetadataCount
+ *                     forwarded unchanged to doMerge()
+ *
+ * NOTE(review): mergeCSFreqCSMap is not refreshed inside the loops after a
+ * doMerge() call - confirm that doMerge() copes with ids that were already
+ * merged earlier in the same pass.
+ */
+static
+void mergeCSByS2(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, 
int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid 
**ontmetadata, int ontmetadataCount){
+ int i, j;
+ int freqId1, freqId2;
+
+ char isLabelComparable = 0;
+ oid name; /* Name of the common ancestor */
+
+
+ /* NOTE(review): both casts below are redundant - labels and isLabelComparable are used further down. */
+ (void) labels;
+ (void) isLabelComparable;
+
+
+
+ for (i = 0; i < curNumMergeCS; i++){
+ freqId1 = mergeCSFreqCSMap[i];
+ /* The left-hand CS is comparable only when its label name is set. */
+ isLabelComparable = 0;
+ if ((*labels)[freqId1].name != BUN_NONE) isLabelComparable = 1; 
// no "DUMMY"
+
+ #if NOT_MERGE_DIMENSIONCS
+ if (freqCSset->items[freqId1].type == DIMENSIONCS) continue;
+ #endif
+ for (j = (i+1); j < curNumMergeCS; j++){
+ freqId2 = mergeCSFreqCSMap[j];
+ #if NOT_MERGE_DIMENSIONCS
+ if (freqCSset->items[freqId2].type == DIMENSIONCS) 
continue;
+ #endif
+ /* && short-circuits: isSemanticSimilar() is not called (and 'name' is not written) when the label is unset. */
+ if (isLabelComparable == 1 && 
isSemanticSimilar(freqId1, freqId2, (*labels), 
ontoUsageTree,freqCSset->numOrigFreqCS, &name) == 1){
+ //printf("Same labels between freqCS %d and 
freqCS %d - Old simscore is %f \n", freqId1, freqId2, simscore);
+ doMerge(freqCSset, S2, freqId1, freqId2, 
mergecsId, labels, ontmetadata, ontmetadataCount, name);
+ }
+
+ }
+ }
+
+}
+/*
+ * Rule S4: merge pairs of CS's whose property-list similarity score exceeds
+ * a threshold.
+ *
+ * Before the pair loop, a PropStat is created with initPropStat() and filled
+ * by getPropStatisticsFromMergeCSs(), and an array of curNumMergeCS TFIDFInfo
+ * entries is allocated and passed to initTFIDFInfos(). Both are released at
+ * the end with freePropStat() and freeTFIDFInfo().
+ *
+ * Every pair (i, j) with i < j < curNumMergeCS of entries in
+ * mergeCSFreqCSMap is scored with similarityScore() when USINGTFIDF is 0 and
+ * with similarityScoreTFIDF() otherwise. A pair is handed to doMerge() with
+ * rule S4 and BUN_NONE as the last argument when the score is greater than
+ * SIM_TFIDF_THRESHOLD (USINGTFIDF non-zero) or SIM_THRESHOLD (USINGTFIDF 0).
+ *
+ * When NOT_MERGE_DIMENSIONCS is enabled, any CS whose type is DIMENSIONCS
+ * is skipped on either side of the pair.
+ *
+ * Parameters:
+ *   freqCSset         set of CS's; freqCSset->items[] is indexed with the
+ *                     ids read from mergeCSFreqCSMap
+ *   labels            forwarded unchanged to doMerge()
+ *   mergeCSFreqCSMap  maps a position 0..curNumMergeCS-1 to a freqCS index
+ *   curNumMergeCS     number of map entries visited; also the length of the
+ *                     TFIDFInfo array
+ *   mergecsId         forwarded unchanged to doMerge()
+ *   ontmetadata, ontmetadataCount
+ *                     forwarded unchanged to doMerge()
+ */
+static
+void mergeCSByS4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap, 
int curNumMergeCS, oid *mergecsId,oid **ontmetadata, int ontmetadataCount){
+ int i, j;
+ int freqId1, freqId2;
+ float simscore = 0.0;
+ CS *cs1, *cs2;
+ int numCombineP = 0;
+
+ PropStat *propStat; /* Store statistics about properties */
+ TFIDFInfo *tfidfInfos;
+
+
+ /* NOTE(review): redundant cast - labels is passed to doMerge() below. */
+ (void) labels;
+ /* propStat is consumed by initTFIDFInfos(); tfidfInfos is consumed by similarityScoreTFIDF(). */
+ propStat = initPropStat();
+ getPropStatisticsFromMergeCSs(propStat, curNumMergeCS, 
mergeCSFreqCSMap, freqCSset); /*TODO: Get PropStat from MaxCSs or From mergedCS 
only*/
+ tfidfInfos = (TFIDFInfo*)malloc(sizeof(TFIDFInfo) * curNumMergeCS);
+ initTFIDFInfos(tfidfInfos, curNumMergeCS, mergeCSFreqCSMap, freqCSset, 
propStat);
+ /* NOTE(review): the malloc() result above is passed to initTFIDFInfos() without a NULL check. */
+
+ for (i = 0; i < curNumMergeCS; i++){
+ freqId1 = mergeCSFreqCSMap[i];
+ //printf("Label of %d CS is %s \n", freqId1, 
(*labels)[freqId1].name);
+
+ #if NOT_MERGE_DIMENSIONCS
+ if (freqCSset->items[freqId1].type == DIMENSIONCS) continue;
+ #endif
+ for (j = (i+1); j < curNumMergeCS; j++){
+ cs1 = (CS*) &(freqCSset->items[freqId1]);
+ /* NOTE(review): cs1 is re-derived on every inner iteration - presumably because doMerge() can change freqCSset->items; confirm. */
+ freqId2 = mergeCSFreqCSMap[j];
+ cs2 = (CS*) &(freqCSset->items[freqId2]);
+ #if NOT_MERGE_DIMENSIONCS
+ if (cs2->type == DIMENSIONCS) continue;
+ #endif
+ /* Score the pair; the TF-IDF variant additionally receives tfidfInfos and the map positions i and j. */
+ if(USINGTFIDF == 0){
+ simscore = similarityScore(cs1->lstProp, 
cs2->lstProp,
+ cs1->numProp,cs2->numProp,&numCombineP);
+
+ //printf("simscore Jaccard = %f \n", simscore);
+ }
+ else{
+ simscore = similarityScoreTFIDF(cs1->lstProp, 
cs2->lstProp,
+ cs1->numProp,cs2->numProp,&numCombineP, 
tfidfInfos, i, j);
+ //printf(" Cosine = %f \n", simscore);
+
+ }
+ /* The threshold is chosen at compile time to match the USINGTFIDF setting used for scoring above. */
+ //simscore = 0.0;
+ #if USINGTFIDF
+ if (simscore > SIM_TFIDF_THRESHOLD){
+ #else
+ if (simscore > SIM_THRESHOLD) {
+ #endif
+ doMerge(freqCSset, S4, freqId1, freqId2, 
mergecsId, labels, ontmetadata, ontmetadataCount, BUN_NONE);
+ }
+ }
+ }
+
+ /* Release the helper structures built before the pair loop. */
+ freePropStat(propStat);
+ freeTFIDFInfo(tfidfInfos, curNumMergeCS);
+
+}
static void putPtoHash(map_t pmap, int key, oid *poid, int support){
oid *getPoid;
oid *putPoid;
@@ -7055,7 +7162,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS);
- /* ---------- S1, S2 ------- */
+ /* ---------- S1 ------- */
mergecsId = *maxCSoid + 1;
mergeMaxFreqCSByS1(freqCSset, labels, &mergecsId, ontmetadata,
ontmetadataCount); /*S1: Merge all freqCS's sharing top-3 candidates */
@@ -7075,13 +7182,13 @@ RDFextractCSwithTypes(int *ret, bat *sba
#endif
tmpLastT = curT;
- /* ---------- S4 ------- */
+ /* ---------- S3 ------- */
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
if (0){
- /*S4: Merge two CS's having the subset-superset relationship */
- mergeCSbyS4(freqCSset, labels, mergeCSFreqCSMap,curNumMergeCS,
ontmetadata, ontmetadataCount);
+ /*S3: Merge two CS's having the subset-superset relationship */
+ mergeCSbyS3(freqCSset, labels, mergeCSFreqCSMap,curNumMergeCS,
ontmetadata, ontmetadataCount);
curNumMergeCS = countNumberMergeCS(freqCSset);
curT = clock();
@@ -7094,26 +7201,26 @@ RDFextractCSwithTypes(int *ret, bat *sba
tmpLastT = curT;
}
- /* ---------- S6 ------- */
+ /* ---------- S5 ------- */
free(mergeCSFreqCSMap);
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
- /* S6: Merged CS referred from the same CS via the same property */
+ /* S5: Merged CS referred from the same CS via the same property */
if (0){
tmpCSrelToMergeCS = generateCsRelToMergeFreqSet(csrelSet, freqCSset);
tmpNumRel = freqCSset->numCSadded;
- mergeMaxFreqCSByS6(tmpCSrelToMergeCS, freqCSset, labels,
mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontmetadata, ontmetadataCount);
- //printf("DISABLE S6 (For Testing) \n");
+ mergeMaxFreqCSByS5(tmpCSrelToMergeCS, freqCSset, labels,
mergeCSFreqCSMap, curNumMergeCS, &mergecsId, ontmetadata, ontmetadataCount);
+ //printf("DISABLE S5 (For Testing) \n");
freeCSrelSet(tmpCSrelToMergeCS,tmpNumRel);
}
curNumMergeCS = countNumberMergeCS(freqCSset);
curT = clock();
- printf("Merging with S6 took %f. (Number of mergeCS: %d | NumconsistOf:
%d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS,
countNumberConsistOfCS(freqCSset));
+ printf("Merging with S5 took %f. (Number of mergeCS: %d | NumconsistOf:
%d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS,
countNumberConsistOfCS(freqCSset));
#if STORE_PERFORMANCE_METRIC_INFO
computeMetricsQ(freqCSset);
@@ -7121,17 +7228,40 @@ RDFextractCSwithTypes(int *ret, bat *sba
tmpLastT = curT;
- /* S3, S5 */
+ //S2: Common ancestor
free(mergeCSFreqCSMap);
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
- mergeCSByS3S5(freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS,
&mergecsId, ontoUsageTree, ontmetadata, ontmetadataCount);
- free(mergeCSFreqCSMap);
+ mergeCSByS2(freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS,
&mergecsId, ontoUsageTree, ontmetadata, ontmetadataCount);
curNumMergeCS = countNumberMergeCS(freqCSset);
curT = clock();
- printf ("Merging with S3, S5 took %f. (Number of mergeCS: %d)
\n",((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS);
+ printf ("Merging with S2 took %f. (Number of mergeCS: %d)
\n",((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS);
+
+ #if NO_OUTPUTFILE == 0
+ printMergedFreqCSSet(freqCSset, mbat, 1, *freqThreshold, *labels, 4);
+ #endif
+
+ #if STORE_PERFORMANCE_METRIC_INFO
+ computeMetricsQ(freqCSset);
+ #endif
+
+ tmpLastT = curT;
+
+
+ //S4: TF/IDF similarity
+ free(mergeCSFreqCSMap);
+ mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
+ initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
+
+ mergeCSByS4(freqCSset, labels, mergeCSFreqCSMap, curNumMergeCS,
&mergecsId, ontmetadata, ontmetadataCount);
+ free(mergeCSFreqCSMap);
+
+ curNumMergeCS = countNumberMergeCS(freqCSset);
+ curT = clock();
+ printf ("Merging with S4 took %f. (Number of mergeCS: %d)
\n",((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS);
+
#if NO_OUTPUTFILE == 0
printMergedFreqCSSet(freqCSset, mbat, 1, *freqThreshold, *labels, 5);
#endif
@@ -7142,12 +7272,11 @@ RDFextractCSwithTypes(int *ret, bat *sba
tmpLastT = curT;
+
updateParentIdxAll(freqCSset);
-
//Finally, re-create mergeFreqSet
-
*csRelMergeFreqSet = generateCsRelBetweenMergeFreqSet(csrelSet,
freqCSset);
#if NO_OUTPUTFILE == 0
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -93,6 +93,7 @@ typedef struct PropStat {
#define INIT_PROP_NUM 10
#define INIT_CS_PER_PROP 10
#define USINGTFIDF 1
+#define COMBINE_S2_S4 0
#define STOREFULLCS 1 /* Store full instance of a CS including the a
subject and list of predicates, objects.
Only use this for finding the name of the
table corresponding to that CS */
@@ -236,10 +237,10 @@ typedef struct SubCSSet{
#define SIM_THRESHOLD 0.6
#define SIM_TFIDF_THRESHOLD 0.55
#define IMPORTANCE_THRESHOLD 0.01
-#define MIN_PERCETAGE_S6 5 // Merge all CS refered by more than
1/MIN_PERCETAGE_S6 percent of a CS via one property
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list