Changeset: c6f62542ceb7 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c6f62542ceb7
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Add a further constraint for the DBpedia case when using S5


diffs (114 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -3237,13 +3237,15 @@ void freeCSrelSum(int maxNumProp, CSrelS
 }
 
 static 
-void generatecsRelSum(CSrel csRel, int freqId, CSset* freqCSset, CSrelSum 
*csRelSum){
+void generatecsRelSum(CSrel csRel, int freqId, CSset* freqCSset, CSrelSum 
*csRelSum, PropStat *propStat){
        int i; 
        int propIdx; 
        int refIdx; 
        int freq; 
        int referredFreqId;
        int freqOfReferredCS; 
+       oid p; 
+       BUN bun = BUN_NONE;
        
        csRelSum->origFreqIdx = freqId;
        csRelSum->numProp = freqCSset->items[freqId].numProp;
@@ -3259,25 +3261,33 @@ void generatecsRelSum(CSrel csRel, int f
                freqOfReferredCS = freqCSset->items[referredFreqId].support;
                if (freq > MIN_FROMTABLE_SIZE_S5 && freq < csRel.lstCnt[i] * 
MIN_PERCETAGE_S5 
                    && freqOfReferredCS < csRel.lstCnt[i] * 
MIN_TO_PERCETAGE_S5){                       
-                       propIdx = 0;
-                       while (csRelSum->lstPropId[propIdx] != 
csRel.lstPropId[i])
-                               propIdx++;
-               
-                       //Add to this prop
-                       refIdx = csRelSum->numPropRef[propIdx];
-                       csRelSum->freqIdList[propIdx][refIdx] = 
csRel.lstRefFreqIdx[i]; 
-                       csRelSum->numPropRef[propIdx]++;
-                       /*
-                       if (csRelSum->numPropRef[propIdx] >  1){
-                               int j;
-                               int toFreqId; 
-                               for (j = 0; j < csRelSum->numPropRef[propIdx]; 
j++){
-                                       toFreqId = 
csRelSum->freqIdList[propIdx][j];
-                                       printf(" FreqCS %d (freq: %d) ", 
toFreqId,freqCSset->items[toFreqId].support);
+                       
+                       p = csRel.lstPropId[i]; 
+                       bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &p);
+                       assert(bun != BUN_NONE);
+                       printf("Prop " BUNFMT "Prop TFIDF score in S5 is %f 
\n",p, propStat->tfidfs[bun]); 
+                       if (propStat->tfidfs[bun] > MIN_TFIDF_PROP_S5){
+
+                               propIdx = 0;
+                               while (csRelSum->lstPropId[propIdx] != 
csRel.lstPropId[i])
+                                       propIdx++;
+                       
+                               //Add to this prop
+                               refIdx = csRelSum->numPropRef[propIdx];
+                               csRelSum->freqIdList[propIdx][refIdx] = 
csRel.lstRefFreqIdx[i]; 
+                               csRelSum->numPropRef[propIdx]++;
+                               
+                               if (csRelSum->numPropRef[propIdx] >  1){
+                                       int j;
+                                       int toFreqId; 
+                                       printf("Prop TFIDF score in S5 is %f 
\n",propStat->tfidfs[bun]); 
+                                       for (j = 0; j < 
csRelSum->numPropRef[propIdx]; j++){
+                                               toFreqId = 
csRelSum->freqIdList[propIdx][j];
+                                               printf(" FreqCS %d (freq: %d | 
coverage: %d) ", toFreqId,freqCSset->items[toFreqId].support, 
freqCSset->items[toFreqId].coverage);
+                                       }
+                                       printf("Will be merged with S5: Refer 
from freqCS %d (freq:%d | cov: %d) with prop "BUNFMT" --> numRef = %d \n", 
freqId,freq, freqCSset->items[freqId].coverage, 
csRelSum->lstPropId[propIdx],csRel.lstCnt[i]);
                                }
-                               printf("Will be merged with S5: Refer from 
freqCS %d (freq:%d) with prop "BUNFMT" --> numRef = %d \n", freqId,freq, 
csRelSum->lstPropId[propIdx],csRel.lstCnt[i]);
-                       }
-                       */      
+                       }
                }
        }
 
@@ -3744,7 +3754,11 @@ void mergeMaxFreqCSByS5(CSrel *csrelMerg
        #endif
        int             maxNumPropInMergeCS =0;
        //int           numCombinedP = 0; 
-       
+       PropStat        *propStat;      //This is for checking whether the prop 
of the FK is common prop or not
+
+       propStat = initPropStat();
+       getPropStatisticsFromMergeCSs(propStat, curNumMergeCS, 
mergeCSFreqCSMap, freqCSset);
+
        printf("Start merging CS by using S5[From FK] \n");
        
        #if NO_OUTPUTFILE == 0
@@ -3769,7 +3783,7 @@ void mergeMaxFreqCSByS5(CSrel *csrelMerg
        for (i = 0; i < curNumMergeCS; i++){
                freqId = mergeCSFreqCSMap[i];
                if (csrelMergeFreqSet[freqId].numRef != 0){
-                       generatecsRelSum(csrelMergeFreqSet[freqId], freqId, 
freqCSset, csRelSum);
+                       generatecsRelSum(csrelMergeFreqSet[freqId], freqId, 
freqCSset, csRelSum,propStat);
                        /* Check the number of */
                        #if NO_OUTPUTFILE == 0
                        fprintf(fout, "csRelSum " BUNFMT " (support: %d, 
coverage %d ): ",csRelSum->origFreqIdx, freqCSset->items[freqId].support, 
freqCSset->items[freqId].coverage);
@@ -3839,6 +3853,7 @@ void mergeMaxFreqCSByS5(CSrel *csrelMerg
 
        freeCSrelSum(maxNumPropInMergeCS, csRelSum);
 
+       freePropStat(propStat);
 }
 
 
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -246,6 +246,8 @@ typedef struct SubCSSet{
                                    // the CS's to-be-merged in this rule must 
cover > MIN_FROMTABLE_SIZE_S6 / MIN_PERCETAGE_S6 triples
 #define MIN_TO_PERCETAGE_S5 10 // Threshold for the number of instances in the 
target CS refered by the property
                                // Number of references > (Frequency of 
referredCS / MIN_TO_PERCETAGE_S5)
+#define MIN_TFIDF_PROP_S5 3    // The prop for FK in S5 must not be a common 
prop, it should be a discriminating one
+                               // This is for preventing the case of webpageID 
link in dbpedia 
 
 //#define MIN_FROMTABLE_SIZE_S5 1              /* For example data */
 #define MINIMUM_TABLE_SIZE 10000   //The minimum number of triples coverred by 
a table (i.e., a final CS) 
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to