Changeset: 688999844231 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=688999844231
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Tune rule S6 and collect statistics on final CSs


diffs (199 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -3450,39 +3450,74 @@ static void getStatisticCSsBySupports(BA
 }
 
 
-static void getStatisticMaxCSs(CSset *freqCSset, char isWriteToFile, int 
freqThreshold){
+static void getStatisticFinalCSs(CSset *freqCSset, BAT *sbat, int 
freqThreshold, int curNumMergeCS, oid* mergeCSFreqCSMap){
 
        //int   *csPropNum; 
        //int   *csFreq; 
        FILE    *fout; 
-       int     numFreqCS, i ; 
+       int     i ; 
        char    filename[100];
        char    tmpStr[20];
-
-       printf("Get statistics of Maximum CSs ....");
-
-       numFreqCS = freqCSset->numCSadded; 
-
-       strcpy(filename, "maxCSStatistic");
+       int     maxNumtriple; 
+       int     minNumtriple = INT_MAX; 
+       int     numMergeCS = 0; 
+       int     totalCoverage = 0; 
+       int     freqId; 
+
+       printf("Get statistics of final CSs ....");
+
+       strcpy(filename, "finalCSStatistic");
        sprintf(tmpStr, "%d", freqThreshold);
        strcat(filename, tmpStr);
        strcat(filename, ".txt");
 
        fout = fopen(filename,"wt"); 
-       fprintf(fout, " csId  #Prop   #frequency maxCSid coverage\n"); 
-
-       for (i = 0; i < numFreqCS; i++){
-               if (freqCSset->items[i].parentFreqIdx == -1){           // 
Check whether it is a maximumCS
+       fprintf(fout, " csId  #Prop   #frequency #coverage\n"); 
+
+       for (i = 0; i < curNumMergeCS; i++){
+               freqId = mergeCSFreqCSMap[i]; 
+               if (freqCSset->items[freqId].parentFreqIdx == -1){              
// Check whether it is a maximumCS
                        // Output the result 
-                       if (isWriteToFile == 0)
-                               printf(BUNFMT "  %d  %d  %d\n", 
freqCSset->items[i].csId, 
freqCSset->items[i].numProp,freqCSset->items[i].support, 
freqCSset->items[i].coverage); 
-                       else 
-                               fprintf(fout, BUNFMT " %d  %d  %d\n", 
freqCSset->items[i].csId, 
freqCSset->items[i].numProp,freqCSset->items[i].support, 
freqCSset->items[i].coverage); 
-
+                       fprintf(fout, BUNFMT " %d  %d  %d\n", 
freqCSset->items[freqId].csId, 
freqCSset->items[freqId].numProp,freqCSset->items[freqId].support, 
freqCSset->items[freqId].coverage); 
+                       if (freqCSset->items[freqId].coverage > maxNumtriple) 
maxNumtriple = freqCSset->items[freqId].coverage;
+                       if (freqCSset->items[freqId].coverage < minNumtriple) 
minNumtriple = freqCSset->items[freqId].coverage;
+                       
+                       totalCoverage += freqCSset->items[freqId].coverage;
+                       numMergeCS++;
                }
        }
-
+       
        fclose(fout); 
+       printf("\nTotal " BUNFMT " triples, coverred by final CSs: %d  (%f 
percent) \n", BATcount(sbat), totalCoverage, 100 * 
(float)(totalCoverage/BATcount(sbat)));
+       printf("Max number of triples coverred by one final CS: %d \n", 
maxNumtriple);
+       printf("Min number of triples coverred by one final CS: %d \n", 
minNumtriple);
+       printf("Avg number of triples coverred by one final CS: %f \n", 
(float)(totalCoverage/numMergeCS));
+
+       //Check if remove all the final CS covering less than 10000 triples
+       
+       totalCoverage = 0;
+       maxNumtriple = 0;
+       minNumtriple = INT_MAX;
+       numMergeCS = 0;
+
+       for (i = 0; i < curNumMergeCS; i++){
+               freqId = mergeCSFreqCSMap[i]; 
+               if (freqCSset->items[freqId].parentFreqIdx == -1 && 
freqCSset->items[freqId].coverage > MINIMUM_TABLE_SIZE){            // Check 
whether it is a maximumCS
+                       // Output the result 
+                       fprintf(fout, BUNFMT " %d  %d  %d\n", 
freqCSset->items[freqId].csId, 
freqCSset->items[freqId].numProp,freqCSset->items[freqId].support, 
freqCSset->items[freqId].coverage); 
+                       if (freqCSset->items[freqId].coverage > maxNumtriple) 
maxNumtriple = freqCSset->items[freqId].coverage;
+                       if (freqCSset->items[freqId].coverage < minNumtriple) 
minNumtriple = freqCSset->items[freqId].coverage;
+                       
+                       totalCoverage += freqCSset->items[freqId].coverage;
+                       numMergeCS++;
+               }
+       }
+       
+       printf("AFTER removing all the 'small' final CSs  ==> Only %d final CSs 
\n", numMergeCS);
+       printf("Total " BUNFMT " triples, coverred by final CSs: %d  (%f 
percent) \n", BATcount(sbat), totalCoverage, 100 * 
(float)(totalCoverage/BATcount(sbat)));
+       printf("Max number of triples coverred by one final CS: %d \n", 
maxNumtriple);
+       printf("Min number of triples coverred by one final CS: %d \n", 
minNumtriple);
+       printf("Avg number of triples coverred by one final CS: %f \n", 
(float)(totalCoverage/numMergeCS));
        //free(csPropNum); 
        printf("Done \n");
 }
@@ -4384,7 +4419,7 @@ str printSampleData(CSSample *csSample, 
 
 static 
 str RDFExtractSampleData(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter 
oi,  
-               oid *subjCSMap, int* csTblIdxMapping, int maxNumPwithDup, 
CSSample *csSample, BAT *tblCandBat){
+               oid *subjCSMap, int* csTblIdxMapping, int maxNumPwithDup, 
CSSample *csSample, BAT *tblCandBat, int numSampleTbl){
 
        BUN             p, q; 
        oid             *sbt = 0, *obt, *pbt;
@@ -4397,7 +4432,7 @@ str RDFExtractSampleData(int *ret, BAT *
        int             tblIdx; 
        BUN             sampleIdx = BUN_NONE; 
        int             totalInstance = 0; 
-       int             maxNumInstance = NUM_SAMPLE_INSTANCE * NUM_SAMPLETABLE;
+       int             maxNumInstance = NUM_SAMPLE_INSTANCE * numSampleTbl;
 
        (void) csSample; 
 
@@ -4832,7 +4867,6 @@ RDFextractCSwithTypes(int *ret, bat *sba
        //getStatisticCSsBySize(csMap,maxNumProp); 
 
        getStatisticCSsBySupports(csBats->pOffsetBat, csBats->freqBat, 
csBats->coverageBat, csBats->fullPBat, 1, *freqThreshold);
-       getStatisticMaxCSs(freqCSset, 1, *freqThreshold);
 
 
 
@@ -5529,13 +5563,17 @@ RDFreorganize(int *ret, CStableStat *cst
        CSlabel         *labels;
        CSrel           *csRelMergeFreqSet = NULL;
 
+       int             curNumMergeCS;
+       oid             *mergeCSFreqCSMap;
+       int             numSampleTbl = 0;  
+
        freqCSset = initCSset();
 
        if (RDFextractCSwithTypes(ret, sbatid, pbatid, obatid, mapbatid, 
freqThreshold, freqCSset,&subjCSMap, &maxCSoid, &maxNumPwithDup, &labels, 
&csRelMergeFreqSet) != MAL_SUCCEED){
                throw(RDF, "rdf.RDFreorganize", "Problem in extracting CSs");
        }
        
-       printf("Start re-organizing triple store for " BUNFMT " CSs \n", 
maxCSoid);
+       printf("Start re-organizing triple store for " BUNFMT " CSs \n", 
maxCSoid + 1);
 
        csTblIdxMapping = (int *) malloc (sizeof (int) * (maxCSoid + 1)); 
        initIntArray(csTblIdxMapping, (maxCSoid + 1), -1);
@@ -5585,12 +5623,17 @@ RDFreorganize(int *ret, CStableStat *cst
 
        // Init CStableStat
        initCStables(cstablestat, freqCSset, csPropTypes, numTables);
-
        
+       // Summarize the statistics
+       curNumMergeCS = countNumberMergeCS(freqCSset);
+       mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
+       initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
+       getStatisticFinalCSs(freqCSset, sbat, *freqThreshold, curNumMergeCS, 
mergeCSFreqCSMap);
+       free(mergeCSFreqCSMap);
+
        /* Extract sample data for the evaluation */
        {       
-       int     curNumMergeCS;
-       oid     *mergeCSFreqCSMap;
+
        BAT     *outputBat;
        CSSample *csSample; 
 
@@ -5602,13 +5645,17 @@ RDFreorganize(int *ret, CStableStat *cst
        mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
        initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
 
-       outputBat = generateTablesForEvaluating(freqCSset, NUM_SAMPLETABLE, 
mergeCSFreqCSMap, curNumMergeCS);
-       assert (BATcount(outputBat) == NUM_SAMPLETABLE);
-       csSample = (CSSample*)malloc(sizeof(CSSample) * NUM_SAMPLETABLE);
+       numSampleTbl = (NUM_SAMPLETABLE > 
(curNumMergeCS/2))?(curNumMergeCS/2):NUM_SAMPLETABLE;
+
+       printf("Select list of sample tables \n");
+       outputBat = generateTablesForEvaluating(freqCSset, numSampleTbl, 
mergeCSFreqCSMap, curNumMergeCS);
+       assert (BATcount(outputBat) == (oid) numSampleTbl);
+       csSample = (CSSample*)malloc(sizeof(CSSample) * numSampleTbl);
+       printf("Select sample instances for %d tables \n", numSampleTbl);
        initSampleData(csSample, outputBat, freqCSset, mergeCSFreqCSMap, 
labels);
-       RDFExtractSampleData(ret, sbat, si, pi, oi, subjCSMap, csTblIdxMapping, 
maxNumPwithDup, csSample, outputBat);
-       printSampleData(csSample, freqCSset, mbat, NUM_SAMPLETABLE);
-       freeSampleData(csSample, NUM_SAMPLETABLE);
+       RDFExtractSampleData(ret, sbat, si, pi, oi, subjCSMap, csTblIdxMapping, 
maxNumPwithDup, csSample, outputBat, numSampleTbl);
+       printSampleData(csSample, freqCSset, mbat, numSampleTbl);
+       freeSampleData(csSample, numSampleTbl);
        BBPreclaim(outputBat);
        BBPunfix(mbat->batCacheid);
        free(mergeCSFreqCSMap);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -173,7 +173,8 @@ typedef struct SubCSSet{
 #define SIM_THRESHOLD 0.6
 #define SIM_TFIDF_THRESHOLD 0.55
 #define IMPORTANCE_THRESHOLD 0.01
-#define MIN_PERCETAGE_S6 10    // Merge all CS refered by more than 
1/MIN_PERCETAGE_S6 percent of a CS via one property
+#define MIN_PERCETAGE_S6 5     // Merge all CS refered by more than 
1/MIN_PERCETAGE_S6 percent of a CS via one property
+#define MINIMUM_TABLE_SIZE 10000   //The minimum number of triples coverred by 
a table (i.e., a final CS) 
 
 typedef struct CSset{
        CS* items;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to