Changeset: 0170da4c1a0c for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0170da4c1a0c
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Merge maxCS's using rule S6.

Rule S6 allows merging all CS's that are referred to by the same CS via a 
specific property.
This considerably reduces the number of freqCS's.


diffs (truncated from 324 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2499,29 +2499,177 @@ void initSuperCSFreqCSMap(CSset *freqCSs
 }
 
 static
-void mergeMaxFreqCSByS6(CSrel *csrelBetweenMaxFreqSet, CSset *freqCSset, oid* 
superCSFreqCSMap, int numMaxCSs){
+CSrelSum* initCSrelSum(int maxNumProp, int maxNumRefPerCS){
        int i; 
-       int freqId1;
-       int relId; 
-       CS*     cs1;
-       for (i = 0; i < numMaxCSs; i++){
-               freqId1 = superCSFreqCSMap[i];
-               cs1 = (CS*) &freqCSset->items[freqId1];
-               relId = cs1->csId; 
-               if (csrelBetweenMaxFreqSet[relId].numRef != 0){
-                       continue;               
+       CSrelSum *csRelSum;     
+       csRelSum = (CSrelSum*)malloc(sizeof(CSrelSum)); 
+       csRelSum->origFreqIdx = -1;
+       csRelSum->numProp = 0;          /* Initially there is no prop */
+       csRelSum->lstPropId = (oid*)malloc(sizeof(oid) * maxNumProp); 
+       csRelSum->numPropRef = (int*)malloc(sizeof(int) * maxNumProp);          
+       csRelSum->freqIdList = (int**)malloc(sizeof(int*) * maxNumProp);
+       for (i = 0; i < maxNumProp; i++){
+               csRelSum->numPropRef[i] = 0;
+               csRelSum->freqIdList[i] = (int*)malloc(sizeof(int) * 
maxNumRefPerCS);
+       }       
+
+       return csRelSum;
+}
+
+static 
+void freeCSrelSum(int maxNumProp, CSrelSum *csRelSum){
+       int i; 
+       for (i = 0; i < maxNumProp; i++){
+               free(csRelSum->freqIdList[i]);
+       }
+       free(csRelSum->freqIdList);
+       free(csRelSum->numPropRef);
+       free(csRelSum->lstPropId);
+       free(csRelSum);
+}
+
+static 
+void generatecsRelSum(CSrel csRel, int freqId, CSset* freqCSset, CSrelSum 
*csRelSum){
+       int i; 
+       int propIdx; 
+       int refIdx; 
+       int freq; 
+
+       csRelSum->origFreqIdx = freqId;
+       csRelSum->numProp = freqCSset->items[freqId].numProp;
+       copyOidSet(csRelSum->lstPropId, freqCSset->items[freqId].lstProp, 
csRelSum->numProp);
+
+       for (i = 0; i < csRelSum->numProp; i++){
+               csRelSum->numPropRef[i] = 0;
+       }
+
+       for (i = 0; i < csRel.numRef; i++){
+               freq = freqCSset->items[csRel.origFreqIdx].support; 
+               if (freq < csRel.lstCnt[i] * 100){                      
+                       propIdx = 0;
+                       while (csRelSum->lstPropId[propIdx] != 
csRel.lstPropId[i])
+                               propIdx++;
+               
+                       //Add to this prop
+                       refIdx = csRelSum->numPropRef[propIdx];
+                       csRelSum->freqIdList[propIdx][refIdx] = 
csRel.lstRefFreqIdx[i]; 
+                       csRelSum->numPropRef[propIdx]++;
                }
        }
 
 }
 
 static
-void mergeMaximumFreqCSsAll(CSset *freqCSset, CSlabel* labels, oid* 
superCSFreqCSMap, oid* superCSMergeMaxCSMap, int numMaxCSs, oid maxCSoid){
+void mergeMaxFreqCSByS6(CSrel *csrelBetweenMaxFreqSet, CSset *freqCSset, oid* 
superCSFreqCSMap, int numMaxCSs, int maxNumProp, oid *mergecsId){
+       int             i; 
+       int             freqId, freqId1, freqId2;
+       //int           relId; 
+       //CS*           cs1;
+       CSrelSum        *csRelSum; 
+       int             maxNumRefPerCS = 0; 
+       int             j, k; 
+
+       CS              *mergecs;
+       oid             existMergecsId = BUN_NONE; 
+       CS              *cs1, *cs2;
+       CS              *existmergecs, *mergecs1, *mergecs2; 
+
+       char            filename[100];
+       FILE            *fout; 
+
+       strcpy(filename, "csRelSum.txt");
+
+       fout = fopen(filename,"wt"); 
+
+       for (i = 0; i < numMaxCSs; i++){
+               freqId = superCSFreqCSMap[i];
+               if (csrelBetweenMaxFreqSet[freqId].numRef > maxNumRefPerCS){
+                       maxNumRefPerCS = csrelBetweenMaxFreqSet[freqId].numRef 
;                
+               }
+       }
+       printf("maxNumRefPerCS = %d \n", maxNumRefPerCS);
+
+       csRelSum = initCSrelSum(maxNumProp,maxNumRefPerCS);
+       
+       for (i = 0; i < numMaxCSs; i++){
+               freqId = superCSFreqCSMap[i];
+               if (csrelBetweenMaxFreqSet[freqId].numRef != 0){
+                       generatecsRelSum(csrelBetweenMaxFreqSet[freqId], 
freqId, freqCSset, csRelSum);
+                       /* Check the number of */
+                       fprintf(fout, "csRelSum " BUNFMT ": 
",csRelSum->origFreqIdx);
+                       for (j = 0; j < csRelSum->numProp; j++){
+                               if ( csRelSum->numPropRef[j] > 1){
+                                       fprintf(fout, "  P " BUNFMT " 
-->",csRelSum->lstPropId[j]);
+                                       for (k = 0; k < 
csRelSum->numPropRef[j]; k++){
+                                               fprintf(fout, " %d | ", 
csRelSum->freqIdList[j][k]);
+                                       }       
+                                       /* Merge each refCS into the first CS. 
+                                        * TODO: The Multi-way merging should 
be better
+                                        * */ 
+                                       freqId1 = csRelSum->freqIdList[j][0];
+                                       cs1 = (CS*) 
&(freqCSset->items[freqId1]);
+                                       for (k = 1; k < 
csRelSum->numPropRef[j]; k++){
+                                               freqId2 = 
csRelSum->freqIdList[j][k];
+                                               cs2 = (CS*) 
&(freqCSset->items[freqId2]);
+                                               //Check whether these CS's 
belong to any mergeCS
+                                               if (cs1->parentFreqIdx == -1 && 
cs2->parentFreqIdx == -1){      /* New merge */
+                                                       mergecs = 
mergeTwoCSs(*cs1,*cs2, freqId1,freqId2, *mergecsId);
+                                                       
//addmergeCStoSet(mergecsSet, *mergecs);
+                                                       cs1->parentFreqIdx = 
freqCSset->numCSadded;
+                                                       cs2->parentFreqIdx = 
freqCSset->numCSadded;
+                                                       
addCStoSet(freqCSset,*mergecs);
+                                                       free(mergecs);
+
+                                                       mergecsId[0]++;
+                                               }
+                                               else if (cs1->parentFreqIdx == 
-1 && cs2->parentFreqIdx != -1){
+                                                       existMergecsId = 
cs2->parentFreqIdx;
+                                                       existmergecs = (CS*) 
&(freqCSset->items[existMergecsId]);
+                                                       
mergeACStoExistingmergeCS(*cs1,freqId1, existmergecs);
+                                                       cs1->parentFreqIdx = 
existMergecsId; 
+                                               }
+                                               
+                                               else if (cs1->parentFreqIdx != 
-1 && cs2->parentFreqIdx == -1){
+                                                       existMergecsId = 
cs1->parentFreqIdx;
+                                                       existmergecs = 
(CS*)&(freqCSset->items[existMergecsId]);
+                                                       
mergeACStoExistingmergeCS(*cs2,freqId2, existmergecs);
+                                                       cs2->parentFreqIdx = 
existMergecsId; 
+                                                       
//printamergeCS(mergecsSet->items[existMergecsId] ,existMergecsId, freqCSset, 
superCSFreqCSMap);
+                                               }
+                                               else if (cs1->parentFreqIdx != 
cs2->parentFreqIdx){
+                                                       mergecs1 = 
(CS*)&(freqCSset->items[cs1->parentFreqIdx]);
+                                                       mergecs2 = 
(CS*)&(freqCSset->items[cs2->parentFreqIdx]);
+                                                       
+                                                       
mergeTwomergeCS(mergecs1, mergecs2, cs1->parentFreqIdx);
+
+                                                       //Re-map for all maxCS 
in mergecs2
+                                                       for (k = 0; k < 
mergecs2->numConsistsOf; k++){
+                                                               
freqCSset->items[mergecs2->lstConsistsOf[k]].parentFreqIdx = cs1->parentFreqIdx;
+                                                       }
+                                               }
+
+                                       }
+                               }
+                       }
+                       fprintf(fout, "\n");
+               }
+       }
+       
+
+
+       fclose(fout); 
+
+
+       freeCSrelSum(maxNumProp, csRelSum);
+
+}
+
+static
+void mergeMaximumFreqCSsAll(CSset *freqCSset, CSlabel* labels, oid* 
superCSFreqCSMap, int numMaxCSs, oid *mergecsId){
        int             i, j, k; 
        int             freqId1, freqId2; 
        float           simscore = 0.0; 
        CS              *mergecs;
-       oid             mercsId = 0; 
        oid             existMergecsId = BUN_NONE; 
        int             numCombineP = 0; 
        CS              *cs1, *cs2;
@@ -2531,20 +2679,26 @@ void mergeMaximumFreqCSsAll(CSset *freqC
        int             nummergedCSs = 0;
        char            isLabelComparable = 0; 
        char            isSameLabel = 0; 
+
+       int             numcurMergedCS;         
+       
+
        
        (void) labels;
        (void) isLabelComparable;
 
-
-
-       //Initial superCSMergeMaxCSMap
-       for (i = 0; i < numMaxCSs; i++){
-               superCSMergeMaxCSMap[i] = BUN_NONE; 
+       numcurMergedCS = 0;
+       for (i = 0; i < freqCSset->numCSadded; i++){
+               if (freqCSset->items[i].parentFreqIdx == -1)    
numcurMergedCS++;
        }
 
-       
+
+       printf("Number of freqCS added = %d \n",freqCSset->numCSadded);
+       printf("Number of freqCS after merging using S6: = %d 
\n",numcurMergedCS);
+
+
        propStat = initPropStat();
-       getPropStatisticsFromMaxCSs(propStat, numMaxCSs, superCSFreqCSMap, 
freqCSset);
+       getPropStatisticsFromMaxCSs(propStat, numMaxCSs, superCSFreqCSMap, 
freqCSset); /*TODO: Get PropStat from MaxCSs or From mergedCS only*/
 
        for (i = 0; i < numMaxCSs; i++){
                freqId1 = superCSFreqCSMap[i];
@@ -2588,16 +2742,16 @@ void mergeMaximumFreqCSsAll(CSset *freqC
                        if (simscore > SIM_THRESHOLD) {
                        #endif                          
                                //Check whether these CS's belong to any mergeCS
-                               
                                if (cs1->parentFreqIdx == -1 && 
cs2->parentFreqIdx == -1){      /* New merge */
-                                       mergecs = mergeTwoCSs(*cs1,*cs2, 
freqId1,freqId2, mercsId + maxCSoid);
+                                       mergecs = mergeTwoCSs(*cs1,*cs2, 
freqId1,freqId2, *mergecsId);
                                        //addmergeCStoSet(mergecsSet, *mergecs);
                                        cs1->parentFreqIdx = 
freqCSset->numCSadded;
                                        cs2->parentFreqIdx = 
freqCSset->numCSadded;
                                        addCStoSet(freqCSset,*mergecs);
                                        free(mergecs);
 
-                                       mercsId++;
+                                       mergecsId[0]++;
+
 
                                }
                                else if (cs1->parentFreqIdx == -1 && 
cs2->parentFreqIdx != -1){
@@ -3581,10 +3735,10 @@ RDFextractCSwithTypes(int *ret, bat *sba
        int             *refCount;      /* Count the number of references to 
each CS */
 
        int*            csIdFreqIdxMap; /* Map a CSId to a freqIdx. Should be 
removed in the future .... */
+       oid             mergecsId = 0; 
 
        int             numMaxCSs = 0; 
        oid             *superCSFreqCSMap; 
-       oid             *superCSMergeMaxCSMap;
        CSset           *freqCSset; 
        clock_t         curT;
        clock_t         tmpLastT; 
@@ -3736,15 +3890,26 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        printCSrelWithMaxSet(csrelToMaxFreqSet, csrelBetweenMaxFreqSet, 
freqCSset, freqCSset->numCSadded, *freqThreshold);  
 
+
+       curT = clock(); 
+       printf (" ----- Generate CSrel with max set took  %f seconds.\n", 
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
+       tmpLastT = curT;                
+
        superCSFreqCSMap = (oid*) malloc(sizeof(oid) * numMaxCSs); 
 
        initSuperCSFreqCSMap(freqCSset, superCSFreqCSMap);
+
+       mergecsId = *maxCSoid + 1; 
        /* S6: Merged CS referred from the same CS via the same property */
-       mergeMaxFreqCSByS6(csrelBetweenMaxFreqSet, freqCSset, superCSFreqCSMap, 
numMaxCSs);
-
-       superCSMergeMaxCSMap = (oid*) malloc(sizeof(oid) * numMaxCSs);
+       mergeMaxFreqCSByS6(csrelBetweenMaxFreqSet, freqCSset, superCSFreqCSMap, 
numMaxCSs, maxNumProp, &mergecsId);
+
+       curT = clock(); 
+       printf (" ----- MergeCS using S6 took  %f seconds.\n", ((float)(curT - 
tmpLastT))/CLOCKS_PER_SEC);
+       tmpLastT = curT;                
+       
        /* S1, S2, S3, S5 */
-       mergeMaximumFreqCSsAll(freqCSset, *labels, superCSFreqCSMap, 
superCSMergeMaxCSMap, numMaxCSs, *maxCSoid);
+       mergeMaximumFreqCSsAll(freqCSset, *labels, superCSFreqCSMap, numMaxCSs, 
&mergecsId);
+
 
        curT = clock(); 
        printf (" ----- Merging Frequent CSs took  %f seconds.\n", 
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
@@ -3771,7 +3936,6 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        freeOntoUsageTree(ontoUsageTree);
        free (superCSFreqCSMap);
-       free (superCSMergeMaxCSMap); 
        
        #if NEEDSUBCS
        free (subjSubCSMap);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to