Changeset: e9c538d1f3ee for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e9c538d1f3ee
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Get the support for maximum CS.

Report the real number of triples coverred by each CS


diffs (truncated from 349 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -84,6 +84,13 @@ static void initArray(oid* inputArr, int
 }
 
 
+static void initIntArray(int* inputArr, int num, oid defaultValue){
+       int i; 
+       for (i = 0; i < num; i++){
+               inputArr[i] = defaultValue;
+       }
+}
+
 static void initCharArray(char* inputArr, int num, char defaultValue){
        int i; 
        for (i = 0; i < num; i++){
@@ -994,8 +1001,9 @@ void testBatHash(void){
 */
 
 static 
-void addNewCS(CSBats *csBats, BUN* csKey, oid* key, oid *csoid, int num){
+void addNewCS(CSBats *csBats, BUN* csKey, oid* key, oid *csoid, int num, int 
numTriples){
        int freq = 1; 
+       int coverage = numTriples; 
        BUN     offset; 
 
        if (csBats->hsKeyBat->T->hash && BATcount(csBats->hsKeyBat) > 4 * 
csBats->hsKeyBat->T->hash->mask) {
@@ -1013,6 +1021,7 @@ void addNewCS(CSBats *csBats, BUN* csKey
        appendArrayToBat(csBats->fullPBat, key, num);
 
        BUNappend(csBats->freqBat, &freq, TRUE); 
+       BUNappend(csBats->coverageBat, &coverage, TRUE); 
 }
 /*
  * Put a CS to the hashmap. 
@@ -1023,16 +1032,17 @@ void addNewCS(CSBats *csBats, BUN* csKey
  * */
 #if STOREFULLCS
 static 
-oid putaCStoHash(CSBats *csBats, oid* key, int num, 
+oid putaCStoHash(CSBats *csBats, oid* key, int num, int numTriples,  
                oid *csoid, char isStoreFreqCS, int freqThreshold, CSset 
*freqCSset, oid subjectId, oid* buffObjs)
 #else
 static 
-oid putaCStoHash(CSBats *csBats, oid* key, int num, 
+oid putaCStoHash(CSBats *csBats, oid* key, int num, int numTriples, 
                oid *csoid, char isStoreFreqCS, int freqThreshold, CSset 
*freqCSset)
 #endif 
 {
        BUN     csKey; 
        int     *freq; 
+       oid     *coverage;      //Total number of triples coverred by this CS
        CS      *freqCS; 
        BUN     bun; 
        oid     csId;           /* Id of the characteristic set */
@@ -1042,7 +1052,7 @@ oid putaCStoHash(CSBats *csBats, oid* ke
        bun = BUNfnd(BATmirror(csBats->hsKeyBat),(ptr) &csKey);
        if (bun == BUN_NONE) {
                csId = *csoid; 
-               addNewCS(csBats, &csKey, key, csoid, num);
+               addNewCS(csBats, &csKey, key, csoid, num, numTriples);
                
                //Handle the case when freqThreshold == 1 
                if (isStoreFreqCS ==1 && freqThreshold == 1){
@@ -1063,7 +1073,7 @@ oid putaCStoHash(CSBats *csBats, oid* ke
                        //printf(" No duplication (new CS) \n");        
                        // New CS
                        csId = *csoid;
-                       addNewCS(csBats, &csKey, key, csoid, num);
+                       addNewCS(csBats, &csKey, key, csoid, num, numTriples);
                        
                        //Handle the case when freqThreshold == 1 
                        if (isStoreFreqCS ==1 && freqThreshold == 1){
@@ -1083,6 +1093,9 @@ oid putaCStoHash(CSBats *csBats, oid* ke
                        // Update freqCS value
                        freq = (int *)Tloc(csBats->freqBat, csId);
                        (*freq)++; 
+                       // Update number of coverred triples
+                       coverage = (oid *)Tloc(csBats->coverageBat, csId); 
+                       (*coverage) += numTriples;
 
                        if (isStoreFreqCS == 1){        /* Store the frequent 
CS to the CSset*/
                                //printf("FreqCS: Support = %d, Threshold %d  
\n ", freq, freqThreshold);
@@ -1154,13 +1167,14 @@ void printCS(CS cs){
  * Here maximum frequent CS is a CS that there exist no other CS which 
contains that CS
  * */
 static 
-void getMaximumFreqCSs(CSset *freqCSset, oid* csSuperCSMap, int numCS){
+void getMaximumFreqCSs(CSset *freqCSset, oid* csSuperCSMap, BAT* coverageBat, 
int* superCSCoverage, int numCS){
 
        int     numFreqCS = freqCSset->numCSadded; 
        int     i, j; 
        int     numMaxCSs = 0;
 
        oid     tmpCSId; 
+       int*    coverage; 
 
        printf("Retrieving maximum frequent CSs: \n");
 
@@ -1212,6 +1226,14 @@ void getMaximumFreqCSs(CSset *freqCSset,
                }
        }
 
+       // Update coverage for maximum CS
+       
+       for (i = 0; i < numFreqCS; i++){
+               tmpCSId = freqCSset->items[i].csId; 
+               coverage = (int*) Tloc(coverageBat, tmpCSId); 
+               superCSCoverage[tmpCSId] += *coverage;
+       }
+
        /*
        printf("CS - SuperCS after tunning ");
        for (i = 0; i < numCS; i++){
@@ -1286,7 +1308,57 @@ static void getStatisticCSsBySize(map_t 
 */
 
 
-static void getStatisticCSsBySupports(BAT *pOffsetBat, BAT *freqBat, BAT 
*fullPBat, oid* csSuperCSMap, char isWriteToFile, int freqThreshold){
+static void getStatisticCSsBySupports(BAT *pOffsetBat, BAT *freqBat, BAT 
*coverageBat, BAT *fullPBat, oid* csSuperCSMap, char isWriteToFile, int 
freqThreshold){
+
+       //int   *csPropNum; 
+       //int   *csFreq; 
+       FILE    *fout; 
+       oid     *offset, *offset2; 
+       int     numP; 
+       BUN     p, q; 
+       BATiter pi, freqi, coveri;
+       int     *freq, *coverage; 
+       char    filename[100];
+       char    tmpStr[20];
+
+       strcpy(filename, "csStatistic");
+       sprintf(tmpStr, "%d", freqThreshold);
+       strcat(filename, tmpStr);
+       strcat(filename, ".txt");
+
+       fout = fopen(filename,"wt"); 
+       fprintf(fout, " csId  #Prop   #frequency #coverage maxCSid\n"); 
+
+       pi = bat_iterator(pOffsetBat);
+       freqi = bat_iterator(freqBat);
+       coveri = bat_iterator(coverageBat); 
+
+       BATloop(pOffsetBat, p, q){
+               offset = (oid *) BUNtloc(pi, p);                
+
+               if ((p+1) != BUNlast(pOffsetBat)){
+                       offset2 = (oid *)BUNtloc(pi, p + 1);
+                       numP = *offset2 - *offset;
+               }
+               else    //Last element
+                       numP = BUNlast(fullPBat) - *offset;
+
+               freq = (int *) BUNtloc(freqi, p); 
+               coverage = (int *) BUNtloc(coveri, p); 
+
+               // Output the result 
+               if (isWriteToFile == 0)
+                       printf(BUNFMT "  %d  %d %d " BUNFMT "\n", p, numP, 
*freq, *coverage, csSuperCSMap[p]); 
+               else 
+                       fprintf(fout, BUNFMT " %d  %d %d " BUNFMT "\n", p, 
numP, *freq, *coverage, csSuperCSMap[p]); 
+       }
+
+       fclose(fout); 
+       //free(csPropNum); 
+}
+
+
+static void getStatisticMaxCSs(BAT *pOffsetBat, BAT *freqBat, BAT *fullPBat, 
oid* csSuperCSMap, int* superCSCoverage, char isWriteToFile, int freqThreshold){
 
        //int   *csPropNum; 
        //int   *csFreq; 
@@ -1299,13 +1371,15 @@ static void getStatisticCSsBySupports(BA
        char    filename[100];
        char    tmpStr[20];
 
-       strcpy(filename, "csStatistic");
+       printf("Get statistics of Maximum CSs \n");
+
+       strcpy(filename, "maxCSStatistic");
        sprintf(tmpStr, "%d", freqThreshold);
        strcat(filename, tmpStr);
        strcat(filename, ".txt");
 
        fout = fopen(filename,"wt"); 
-       fprintf(fout, " csId  #Prop   #frequency maxCSid\n"); 
+       fprintf(fout, " csId  #Prop   #frequency maxCSid coverage\n"); 
 
        pi = bat_iterator(pOffsetBat);
        freqi = bat_iterator(freqBat);
@@ -1322,19 +1396,21 @@ static void getStatisticCSsBySupports(BA
 
                freq = (int *) BUNtloc(freqi, p); 
 
+               
+               if (csSuperCSMap[p] == p){              // Check whether it is 
a maximumCS
+                       // Output the result 
+                       if (isWriteToFile == 0)
+                               printf(BUNFMT "  %d  %d " BUNFMT  " %d\n", p, 
numP, *freq, csSuperCSMap[p], superCSCoverage[p]); 
+                       else 
+                               fprintf(fout, BUNFMT " %d  %d " BUNFMT  " 
%d\n", p, numP, *freq, csSuperCSMap[p], superCSCoverage[p]); 
 
-               // Output the result 
-               if (isWriteToFile == 0)
-                       printf(BUNFMT "  %d  %d " BUNFMT "\n", p, numP, *freq, 
csSuperCSMap[p]); 
-               else 
-                       fprintf(fout, BUNFMT " %d  %d " BUNFMT "\n", p, numP, 
*freq, csSuperCSMap[p]); 
+               }
        }
 
        fclose(fout); 
        //free(csPropNum); 
 }
 
-
 /*
  * Get the refer CS 
  * Input: oid of a URI object 
@@ -1420,6 +1496,12 @@ CSBats* initCSBats(void){
                return NULL; 
        }
 
+       csBats->coverageBat = BATnew(TYPE_void, TYPE_int, smallbatsz);
+       
+       if (csBats->coverageBat == NULL) {
+               return NULL; 
+       }
+
        return csBats; 
 }
 
@@ -1430,6 +1512,7 @@ void freeCSBats(CSBats *csBats){
        BBPreclaim(csBats->hsKeyBat); 
        BBPreclaim(csBats->hsValueBat); 
        BBPreclaim(csBats->freqBat); 
+       BBPreclaim(csBats->coverageBat); 
        BBPreclaim(csBats->pOffsetBat); 
        BBPreclaim(csBats->fullPBat); 
 
@@ -1456,7 +1539,7 @@ str RDFassignCSId(int *ret, BAT *sbat, B
        oid*    _tmp;
        int     INIT_PROPERTY_NUM = 100; 
        oid     returnCSid; 
-       
+
        #if STOREFULLCS
        oid     *obt; 
        oid*    buffObjs;
@@ -1478,9 +1561,9 @@ str RDFassignCSId(int *ret, BAT *sbat, B
                if (*sbt != curS){
                        if (p != 0){    /* Not the first S */
                                #if STOREFULLCS
-                               returnCSid = putaCStoHash(csBats, buff, numP, 
&CSoid, 1, *freqThreshold, freqCSset, curS, buffObjs); 
+                               returnCSid = putaCStoHash(csBats, buff, numP, 
numPwithDup, &CSoid, 1, *freqThreshold, freqCSset, curS, buffObjs); 
                                #else
-                               returnCSid = putaCStoHash(csBats, buff, numP, 
&CSoid, 1, *freqThreshold, freqCSset); 
+                               returnCSid = putaCStoHash(csBats, buff, numP, 
numPwithDup, &CSoid, 1, *freqThreshold, freqCSset); 
                                #endif
 
                                subjCSMap[curS] = returnCSid;                   
@@ -1541,9 +1624,9 @@ str RDFassignCSId(int *ret, BAT *sbat, B
        
        /*put the last CS */
        #if STOREFULLCS
-       returnCSid = putaCStoHash(csBats, buff, numP, &CSoid, 1, 
*freqThreshold, freqCSset, curS, buffObjs); 
+       returnCSid = putaCStoHash(csBats, buff, numP, numPwithDup, &CSoid, 1, 
*freqThreshold, freqCSset, curS, buffObjs); 
        #else
-       returnCSid = putaCStoHash(csBats, buff, numP, &CSoid, 1, 
*freqThreshold, freqCSset ); 
+       returnCSid = putaCStoHash(csBats, buff, numP, numPwithDup, &CSoid, 1, 
*freqThreshold, freqCSset ); 
        #endif
        
        subjCSMap[curS] = returnCSid;                   
@@ -1650,6 +1733,8 @@ RDFextractCSwithTypes(int *ret, bat *sba
        CSrel           *csrelBetweenMaxFreqSet; 
        SubCSSet        *csSubCSMap; 
        oid             *csSuperCSMap;  
+       int             *superCSCoverage;  /* Store the number of triples 
coverred by each superCS 
+                                             This array will have many NULL 
values  */
 
        if ((sbat = BATdescriptor(*sbatid)) == NULL) {
                throw(MAL, "rdf.RDFextractCSwithTypes", RUNTIME_OBJECT_MISSING);
@@ -1708,6 +1793,8 @@ RDFextractCSwithTypes(int *ret, bat *sba
        csSuperCSMap = (oid*) malloc(sizeof(oid) * (maxCSoid + 1));
        initArray(csSuperCSMap, maxCSoid + 1, BUN_NONE);
 
+       superCSCoverage = (int*) malloc(sizeof(int) * (maxCSoid + 1));
+       initIntArray(superCSCoverage, maxCSoid + 1, 0);
 
        generateFreqCSMap(freqCSset,csFreqMap); 
 
@@ -1730,7 +1817,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        //getTopFreqCSs(csMap,*freqThreshold);
 
-       getMaximumFreqCSs(freqCSset, csSuperCSMap, maxCSoid + 1); 
+       getMaximumFreqCSs(freqCSset, csSuperCSMap, csBats->coverageBat, 
superCSCoverage, maxCSoid + 1); 
 
        printFreqCSSet(freqCSset, csSuperCSMap, csBats->freqBat, mbat, 1, 
*freqThreshold); 
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to