Changeset: 43220a7f567b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=43220a7f567b
Modified Files:
        monetdb5/extras/rdf/rdfminheap.c
        monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

Implement multi-way merging for multiple CS's.

Currently, checking with S6. Need to check more.


diffs (truncated from 399 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfminheap.c b/monetdb5/extras/rdf/rdfminheap.c
--- a/monetdb5/extras/rdf/rdfminheap.c
+++ b/monetdb5/extras/rdf/rdfminheap.c
@@ -133,7 +133,7 @@ int *mergeKArrays(int **arr, int k)
 }
 
 /* Merge multi property lists in freqCSset. This is used for forming mergeCS */
-#define INIT_MERGELIST_SIZE 1000 
+#define INIT_MERGELIST_SIZE 100 
 
 oid* mergeMultiPropList(CSset *freqCSset, int *freqIdList, int k, int 
*numCombinedP)
 {
@@ -142,8 +142,10 @@ oid* mergeMultiPropList(CSset *freqCSset
        MinHeap *hp;
        int freqIdx; 
        MinHeapNode *harr;
+       int numAllocation = INIT_MERGELIST_SIZE;
+       oid *tmp;
 
-       oid *output = (oid *) malloc(sizeof(oid) * INIT_MERGELIST_SIZE); // 
Output array
+       oid *output = (oid *) malloc(sizeof(oid) * numAllocation); // Output 
array
 
        /*
        printf("Input list: \n");
@@ -180,6 +182,11 @@ oid* mergeMultiPropList(CSset *freqCSset
                if (root.element == INT_MAX) break; 
                
                if (output[*numCombinedP - 1] != root.element){         //Only 
append the distinct prop to combined list
+                       if (*numCombinedP == numAllocation){
+                               numAllocation += INIT_MERGELIST_SIZE;           
+                               tmp = realloc(output, sizeof(oid) * 
numAllocation); 
+                               output = (oid*)tmp; 
+                       }
                        output[*numCombinedP] = root.element;
                        (*numCombinedP)++;
                }
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -963,10 +963,10 @@ void freeCS(CS *cs){
 */
 #if STOREFULLCS
 static
-CS* creatCS(oid csId, int numP, oid* buff, oid subjectId, oid* lstObject, char 
type, int parentfreqIdx, int support, int coverage)
+CS* creatCS(oid csId, int freqIdx, int numP, oid* buff, oid subjectId, oid* 
lstObject, char type, int parentfreqIdx, int support, int coverage)
 #else
 static 
-CS* creatCS(oid csId, int numP, oid* buff, char type,  int parentfreqIdx, int 
support, int coverage)
+CS* creatCS(oid csId, int freqIdx, int numP, oid* buff, char type,  int 
parentfreqIdx, int support, int coverage)
 #endif 
 {
        CS *cs = (CS*)malloc(sizeof(CS)); 
@@ -1007,6 +1007,11 @@ CS* creatCS(oid csId, int numP, oid* buf
        cs->support = support;
        cs->coverage = coverage; 
 
+       // For using in the merging process
+       cs->numConsistsOf = 1;
+       cs->lstConsistsOf = (int *) malloc(sizeof(int)); 
+       cs->lstConsistsOf[0]= freqIdx; 
+
        return cs; 
 }
 
@@ -1212,55 +1217,193 @@ void mergeMultiOidSets(CSset *freqCSset,
                
 }
 
-
+*/
+/* Get distinct list of mergedFreqIdx */
+static  
+int* getDistinctList(int *lstMergeCSFreqId, int num, int *numDistinct){
+       int i; 
+       int *lstDistinctFreqId;
+       BAT     *tmpBat; 
+       int     *first; 
+       int     last; 
+
+       lstDistinctFreqId = (int*) malloc(sizeof(int) * num); /* A bit 
redundant */
+       
+       tmpBat = BATnew(TYPE_void, TYPE_int, num);
+
+       for (i = 0; i < num; i++){
+               tmpBat = BUNappend(tmpBat, &lstMergeCSFreqId[i], TRUE);
+       }
+
+       /* Sort the array of the freqIdx list in order to remove duplication */
+       
+       //TODO: Ask whether there is a sorting function available for an array
+       //TODO: Ask why it is not possible by using memcpy
+       
+       //memcpy(Tloc(tmpBat, BUNfirst(tmpBat)), lstMergeCSFreqId, sizeof(int) 
* num); 
+       //memcpy(Hloc(tmpBat, BUNfirst(tmpBat)), hSeq, sizeof(oid) * num); 
+       //BATsetcount(tmpBat, (BUN) (tmpBat->batCount + num));
+
+       BATorder(BATmirror(tmpBat));
+
+       first = (int*)Tloc(tmpBat, BUNfirst(tmpBat));
+       last = *first; 
+       *numDistinct = 1; 
+       lstDistinctFreqId[0] = *first; 
+       for (i =1; i < num; i++){
+               first++; 
+               if (last != *first){    /*new value*/
+                       lstDistinctFreqId[*numDistinct] = *first; 
+                       (*numDistinct)++;
+                       last = *first; 
+               }
+       }
+
+       BBPreclaim(tmpBat);
+
+       return lstDistinctFreqId; 
+
+}
+/*
+Multi-way merging for list of freqCS
+*/
 static 
-void mergeMultiCS(CS *freqCSset, int *lstFreqId, int num){
+void mergeMultiCS(CSset *freqCSset, int *lstFreqId, int num, oid *mergecsId){
        
-       int numCombineP; 
-       int* _tmp1; 
-       oid* _tmp2; 
-       oid* oldlstProp1; 
-       oid* oldlstProp2; 
-       int i; 
-
-        _tmp1 = realloc(mergecs1->lstConsistsOf, ((mergecs1->numConsistsOf + 
mergecs2->numConsistsOf) * sizeof(int)));
-
-       if (!_tmp1){
-               fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+       int     i, j, tmpIdx; 
+       int     *lstMergeCSFreqId, *lstDistinctFreqId; 
+       int     numDistinct = 0; 
+       int     mergeNumConsistsOf = 0;
+       int     tmpFreqIdx; 
+       int     tmpConsistFreqIdx; 
+       CS      *newmergeCS; 
+       char    isExistingMergeCS = 0;
+       int     mergecsFreqIdx = -1;
+       int     *_tmp; 
+       oid     *_tmp2; 
+       oid     *tmpPropList; 
+       int     origSupport = 0;
+       int     origCoverage = 0;
+       int     numCombinedP = 0; 
+       
+
+
+
+       /* Get the list of merge FreqIdx */
+       lstMergeCSFreqId = (int*) malloc(sizeof(int) * num); 
+
+       for (i = 0; i < num; i++){
+               if (freqCSset->items[lstFreqId[i]].parentFreqIdx != -1){
+                       lstMergeCSFreqId[i] = 
freqCSset->items[lstFreqId[i]].parentFreqIdx;
+                       mergecsFreqIdx = lstMergeCSFreqId[i]; //An existing one
+                       isExistingMergeCS = 1; 
+               }
+               else
+                       lstMergeCSFreqId[i] = lstFreqId[i];
        }
-       mergecs1->lstConsistsOf = (int*)_tmp1;
-       for (i = 0; i < mergecs2->numConsistsOf; i++){
-               mergecs1->lstConsistsOf[mergecs1->numConsistsOf] = 
mergecs2->lstConsistsOf[i]; 
-               mergecs1->numConsistsOf++;
+       if (isExistingMergeCS == 0) mergecsFreqIdx = freqCSset->numCSadded; 
+
+       lstDistinctFreqId = getDistinctList(lstMergeCSFreqId,num, &numDistinct);
+
+       if (numDistinct < 2){
+               free(lstMergeCSFreqId);
+               free(lstDistinctFreqId);
+               return;
        }
 
-
-       oldlstProp1 = malloc (sizeof(oid) * mergecs1->numProp); 
-       memcpy(oldlstProp1, mergecs1->lstProp, (mergecs1->numProp) * 
sizeof(oid));
+       /* Create or not create a new CS */
+       if (isExistingMergeCS){
+               newmergeCS = (CS*) &(freqCSset->items[mergecsFreqIdx]);
+               origSupport = newmergeCS->support; 
+               origCoverage = newmergeCS->coverage;
+       }
+       else{
+               newmergeCS = (CS*) malloc (sizeof (CS));
+               newmergeCS->support = 0;
+               newmergeCS->coverage = 0; 
+       }
+
+       /*Reset parentIdx */
+       newmergeCS->parentFreqIdx = -1;
+       newmergeCS->type = MERGECS;
        
-       oldlstProp2 = malloc (sizeof(oid) * mergecs2->numProp); 
-       memcpy(oldlstProp2, mergecs2->lstProp, (mergecs2->numProp) * 
sizeof(oid));
-
-        _tmp2 = realloc(mergecs1->lstProp, ((mergecs1->numProp + 
mergecs2->numProp) * sizeof(oid)));
-
-       if (!_tmp2){
-               fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+
+       /* Calculate number of consistsOf in the merged CS 
+
+        and  Update support and coverage: Total of all suppors */
+
+       for (i = 0; i < numDistinct; i++){
+               tmpFreqIdx = lstDistinctFreqId[i]; 
+               mergeNumConsistsOf += 
freqCSset->items[tmpFreqIdx].numConsistsOf; 
        }
-       mergecs1->lstProp = (oid*)_tmp2;
-
-       mergeOidSets(oldlstProp1, oldlstProp2, mergecs1->lstProp, 
mergecs1->numProp, mergecs2->numProp, &numCombineP); 
-
-       mergecs1->numProp = numCombineP;
-       mergecs1->support += mergecs2->support;
-       mergecs1->coverage += mergecs2->coverage;
-
-       // Remove mergecs2
-       mergecs2->parentFreqIdx = parentFreqIdx; 
-
-       free(oldlstProp1);
-       free(oldlstProp2); 
+       
+       printf("Number of freqCS consisted in mergeCS: %d \n", 
mergeNumConsistsOf);
+       if (isExistingMergeCS){
+               _tmp = realloc(newmergeCS->lstConsistsOf, sizeof(int) * 
mergeNumConsistsOf); 
+               if (!_tmp){
+                       fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+               }
+               newmergeCS->lstConsistsOf = (int*)_tmp;
+       }
+       else{
+               newmergeCS->lstConsistsOf = (int*)malloc(sizeof(int)  * 
mergeNumConsistsOf);
+       }
+
+       tmpIdx = 0;
+       for (i = 0; i < numDistinct; i++){
+               tmpFreqIdx = lstDistinctFreqId[i];
+               for (j = 0; j < freqCSset->items[tmpFreqIdx].numConsistsOf; 
j++){
+                       tmpConsistFreqIdx =  
freqCSset->items[tmpFreqIdx].lstConsistsOf[j];
+                       newmergeCS->lstConsistsOf[tmpIdx] = tmpConsistFreqIdx; 
+                       //Reset the parentFreqIdx
+                       freqCSset->items[tmpConsistFreqIdx].parentFreqIdx = 
mergecsFreqIdx;
+                       tmpIdx++;
+               }
+               //Update support
+               newmergeCS->support += freqCSset->items[tmpFreqIdx].support; 
+               newmergeCS->coverage += freqCSset->items[tmpFreqIdx].coverage;
+       }
+       assert(tmpIdx == mergeNumConsistsOf);
+       newmergeCS->numConsistsOf = mergeNumConsistsOf;
+
+       // Has to minus the support & coverage of the existed CS
+       newmergeCS->support = newmergeCS->support - origSupport;
+       newmergeCS->coverage = newmergeCS->coverage - origCoverage; 
+
+       /*Merge the list of prop list */
+       tmpPropList = mergeMultiPropList(freqCSset, lstDistinctFreqId, 
numDistinct, &numCombinedP);
+       printf("Combined P has %d props \n", numCombinedP); 
+       if (isExistingMergeCS){         //For existed mergeCS, reallocate 
lstProp       
+               _tmp2 = realloc(newmergeCS->lstProp, sizeof(oid) * 
numCombinedP); 
+               
+               if (!_tmp2){
+                       fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+               }
+               newmergeCS->lstProp = (oid*)_tmp2; 
+               memcpy(newmergeCS->lstProp, tmpPropList, numCombinedP * 
sizeof(oid));
+               newmergeCS->numProp = numCombinedP;
+               newmergeCS->numAllocation = numCombinedP;
+       }
+       else{
+               newmergeCS->lstProp =  (oid*) malloc(sizeof(oid) * 
numCombinedP);
+               memcpy(newmergeCS->lstProp, tmpPropList, numCombinedP * 
sizeof(oid));           
+               newmergeCS->numProp = numCombinedP;
+               newmergeCS->numAllocation = numCombinedP;
+               newmergeCS->csId = *mergecsId;
+               mergecsId[0]++;
+       }
+
+       free(tmpPropList); 
+
+       
+       if (isExistingMergeCS == 0){
+               addCStoSet(freqCSset, *newmergeCS);
+               free(newmergeCS); 
+       }
+
+       free(lstMergeCSFreqId);
+       free(lstDistinctFreqId);
 }
-*/
 
 static 
 str printFreqCSSet(CSset *freqCSset, BAT *freqBat, BAT *mapbat, char 
isWriteTofile, int freqThreshold, CSlabel* labels){
@@ -1802,9 +1945,9 @@ oid putaCStoHash(CSBats *csBats, oid* ke
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to