Changeset: 76e64ee7921f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=76e64ee7921f
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Count the number of subjects that have no discriminating property in the final table.


diffs (140 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -3237,12 +3237,9 @@ PropStat* getPropStatisticsByTable(int n
                        propStat->maxNumPPerCS = cs.numProp;
        }
 
-       /* Do not calculate the TFIDF score. May need in the future  
-        *  
        for (i = 0; i < propStat->numAdded; i++){
-               propStat->tfidfs[i] = tfidfComp(propStat->freqs[i],numMaxCSs);
-       }
-       */
+               propStat->tfidfs[i] = tfidfComp(propStat->freqs[i],numTables);
+       }
 
        *numdistinctMCS = k; 
 
@@ -5261,7 +5258,6 @@ str RDFassignCSId(int *ret, BAT *sbat, B
        return MAL_SUCCEED; 
 }
 
-
 static 
 str RDFgetRefCounts(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi, 
oid *subjCSMap, int maxNumProp, BUN maxSoid, int *refCount){
 
@@ -7904,6 +7900,18 @@ void computeMetricsQForRefinedTable(CSse
        int     tmpNumFreqProps;
        int     *numRefinedFills = NULL;
        int     *numRefinedSupport = NULL;
+       #if NO_OUTPUTFILE == 0  
+       PropStat *propStat = NULL;      
+       int     numdistinctMCS = 0;
+       int     numSubjWithoutDiscProp = 0;
+       int     numTriplesWihtoutDiscProp = 0;
+       char    isContainedDiscProp = 0;
+       oid     p;
+       oid     *pbt;   
+       BUN     bun;
+       FILE    *fout; 
+       char    filename[100];
+       #endif
 
        fillRatio = (float*)malloc(sizeof(float) * numTables);
        refRatio = (float*)malloc(sizeof(float) * numTables);
@@ -7917,6 +7925,25 @@ void computeMetricsQForRefinedTable(CSse
                numRefinedSupport[i] =  
freqCSset->items[mTblIdxFreqIdxMapping[i]].support; 
        }
        
+       
+       #if NO_OUTPUTFILE == 0
+       
+       propStat = getPropStatisticsByTable(numTables, mTblIdxFreqIdxMapping, 
freqCSset,  &numdistinctMCS);
+       //Print the TF-IDF score of each prop in each table
+       
+       strcpy(filename,"propStatWithFinalSchema.txt");
+       fout = fopen(filename,"wt"); 
+       fprintf(fout, "PropertyOid #ofCSs tfidfscore"); 
+       for (i = 0; i < propStat->numAdded; i++){
+               pbt = (oid *) Tloc(propStat->pBat, i);
+               fprintf(fout, BUNFMT "  %d      %f \n", *pbt, 
propStat->plCSidx[i].numAdded,propStat->tfidfs[i]);
+       }
+       fclose(fout);
+       #endif
+
+       
+
+
        //Removing LOTSOFNULL_SUBJECT_THRESHOLD 
        //Check which freqCS having small number of prop
        //--> they will be removed from the final table.
@@ -7961,7 +7988,38 @@ void computeMetricsQForRefinedTable(CSse
                                
 
                }
-       }
+               
+               #if NO_OUTPUTFILE == 0
+               //Get the number of subjects having no discriminating props in the final table
+               cs = freqCSset->items[i];
+               
+               isContainedDiscProp = 0;
+               for (j = 0; j < cs.numProp; j++){
+                       p = cs.lstProp[j]; 
+                       bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &p);
+                       if (bun == BUN_NONE) {
+                               printf("FreqCS: %d, prop "BUNFMT" --> This prop 
must be in propStat!!!!\n",i,p);
+                       }
+                       else{
+                                if (propStat->tfidfs[bun] > 
MIN_TFIDF_PROP_FINALTABLE) {
+                                       isContainedDiscProp = 1;
+                                       break;
+                               }
+                       }
+               }
+               if (isContainedDiscProp == 0){  //There is no discriminating prop in this CS
+                       numSubjWithoutDiscProp += cs.support;
+                       numTriplesWihtoutDiscProp += cs.coverage;
+               }
+       
+
+               #endif
+       }
+       
+       #if NO_OUTPUTFILE == 0
+       printf("Number of Subject having no discriminating props is: 
%d\n",numSubjWithoutDiscProp);
+       printf(" ==> Removing these subject will remove %d triples 
\n",numTriplesWihtoutDiscProp);
+       #endif  
        
        for (i = 0; i < numTables; i++){
                tmpFinalFreqIdx = mTblIdxFreqIdxMapping[i];
@@ -7995,6 +8053,9 @@ void computeMetricsQForRefinedTable(CSse
        free(weight); 
        free(numRefinedFills);
        free(numRefinedSupport);
+       #if NO_OUTPUTFILE == 0
+       freePropStat(propStat);
+       #endif
 
 }
 #endif
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -251,6 +251,7 @@ typedef struct SubCSSet{
 #define MIN_TFIDF_PROP_S4 3.5  //  When we merge two CS's based on the 
tf-idf/consine similarity score, we want 
                                // to make sure that we do not merge two CS's 
that may have same set of really common properties
                                // such as type, description. They should have 
at least one discriminating prop in common. 
+#define MIN_TFIDF_PROP_FINALTABLE 2.5 //A discriminating prop is a prop that appears in less than 10% of the tables
 
 //#define MIN_FROMTABLE_SIZE_S5 1              /* For example data */
 #define MINIMUM_TABLE_SIZE 10000   //The minimum number of triples coverred by 
a table (i.e., a final CS) 
@@ -264,6 +265,7 @@ typedef struct SubCSSet{
 #define REMOVE_INFREQ_PROP     1
 #define REMOVE_LOTSOFNULL_SUBJECT      1
 #define        LOTSOFNULL_SUBJECT_THRESHOLD    0.1
+#define DETECT_INCORRECT_TYPE_SUBJECT  1       //Detect subjects that are assigned the wrong type
 
 #define        MIN_FK_FREQUENCY        0.1     // The frequency of a FK should 
be > MIN_FK_FREQUENCY * The frequency of a mergedCS (or the number of tuples in 
one table)      
 #define MIN_FK_PROPCOVERAGE    0.9     // The FK needs to happen in 
MIN_FK_PROPCOVERAGE of all instances of the particular property
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to