Changeset: 43220a7f567b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=43220a7f567b
Modified Files:
monetdb5/extras/rdf/rdfminheap.c
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:
Implement multi-way merging for multiple CS's.
Currently, checking with S6. Need to check more.
diffs (truncated from 399 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfminheap.c b/monetdb5/extras/rdf/rdfminheap.c
--- a/monetdb5/extras/rdf/rdfminheap.c
+++ b/monetdb5/extras/rdf/rdfminheap.c
@@ -133,7 +133,7 @@ int *mergeKArrays(int **arr, int k)
}
/* Merge multi property lists in freqCSset. This is used for forming mergeCS */
-#define INIT_MERGELIST_SIZE 1000
+#define INIT_MERGELIST_SIZE 100
oid* mergeMultiPropList(CSset *freqCSset, int *freqIdList, int k, int
*numCombinedP)
{
@@ -142,8 +142,10 @@ oid* mergeMultiPropList(CSset *freqCSset
MinHeap *hp;
int freqIdx;
MinHeapNode *harr;
+ int numAllocation = INIT_MERGELIST_SIZE;
+ oid *tmp;
- oid *output = (oid *) malloc(sizeof(oid) * INIT_MERGELIST_SIZE); //
Output array
+ oid *output = (oid *) malloc(sizeof(oid) * numAllocation); // Output
array
/*
printf("Input list: \n");
@@ -180,6 +182,11 @@ oid* mergeMultiPropList(CSset *freqCSset
if (root.element == INT_MAX) break;
if (output[*numCombinedP - 1] != root.element){ //Only
append the distinct prop to combined list
+ if (*numCombinedP == numAllocation){
+ numAllocation += INIT_MERGELIST_SIZE;
+ tmp = realloc(output, sizeof(oid) *
numAllocation);
+ output = (oid*)tmp;
+ }
output[*numCombinedP] = root.element;
(*numCombinedP)++;
}
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -963,10 +963,10 @@ void freeCS(CS *cs){
*/
#if STOREFULLCS
static
-CS* creatCS(oid csId, int numP, oid* buff, oid subjectId, oid* lstObject, char
type, int parentfreqIdx, int support, int coverage)
+CS* creatCS(oid csId, int freqIdx, int numP, oid* buff, oid subjectId, oid*
lstObject, char type, int parentfreqIdx, int support, int coverage)
#else
static
-CS* creatCS(oid csId, int numP, oid* buff, char type, int parentfreqIdx, int
support, int coverage)
+CS* creatCS(oid csId, int freqIdx, int numP, oid* buff, char type, int
parentfreqIdx, int support, int coverage)
#endif
{
CS *cs = (CS*)malloc(sizeof(CS));
@@ -1007,6 +1007,11 @@ CS* creatCS(oid csId, int numP, oid* buf
cs->support = support;
cs->coverage = coverage;
+ // For using in the merging process
+ cs->numConsistsOf = 1;
+ cs->lstConsistsOf = (int *) malloc(sizeof(int));
+ cs->lstConsistsOf[0]= freqIdx;
+
return cs;
}
@@ -1212,55 +1217,193 @@ void mergeMultiOidSets(CSset *freqCSset,
}
-
+*/
+/* Get distinct list of mergedFreqIdx */
+static
+int* getDistinctList(int *lstMergeCSFreqId, int num, int *numDistinct){
+ int i;
+ int *lstDistinctFreqId;
+ BAT *tmpBat;
+ int *first;
+ int last;
+
+ lstDistinctFreqId = (int*) malloc(sizeof(int) * num); /* A bit
redundant */
+
+ tmpBat = BATnew(TYPE_void, TYPE_int, num);
+
+ for (i = 0; i < num; i++){
+ tmpBat = BUNappend(tmpBat, &lstMergeCSFreqId[i], TRUE);
+ }
+
+ /* Sort the array of the freqIdx list in order to remove duplication */
+
+ //TODO: Ask whether there is a sorting function available for an array
+ //TODO: Ask why it is not possible by using memcpy
+
+ //memcpy(Tloc(tmpBat, BUNfirst(tmpBat)), lstMergeCSFreqId, sizeof(int)
* num);
+ //memcpy(Hloc(tmpBat, BUNfirst(tmpBat)), hSeq, sizeof(oid) * num);
+ //BATsetcount(tmpBat, (BUN) (tmpBat->batCount + num));
+
+ BATorder(BATmirror(tmpBat));
+
+ first = (int*)Tloc(tmpBat, BUNfirst(tmpBat));
+ last = *first;
+ *numDistinct = 1;
+ lstDistinctFreqId[0] = *first;
+ for (i =1; i < num; i++){
+ first++;
+ if (last != *first){ /*new value*/
+ lstDistinctFreqId[*numDistinct] = *first;
+ (*numDistinct)++;
+ last = *first;
+ }
+ }
+
+ BBPreclaim(tmpBat);
+
+ return lstDistinctFreqId;
+
+}
+/*
+Multi-way merging for list of freqCS
+*/
static
-void mergeMultiCS(CS *freqCSset, int *lstFreqId, int num){
+void mergeMultiCS(CSset *freqCSset, int *lstFreqId, int num, oid *mergecsId){
- int numCombineP;
- int* _tmp1;
- oid* _tmp2;
- oid* oldlstProp1;
- oid* oldlstProp2;
- int i;
-
- _tmp1 = realloc(mergecs1->lstConsistsOf, ((mergecs1->numConsistsOf +
mergecs2->numConsistsOf) * sizeof(int)));
-
- if (!_tmp1){
- fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+ int i, j, tmpIdx;
+ int *lstMergeCSFreqId, *lstDistinctFreqId;
+ int numDistinct = 0;
+ int mergeNumConsistsOf = 0;
+ int tmpFreqIdx;
+ int tmpConsistFreqIdx;
+ CS *newmergeCS;
+ char isExistingMergeCS = 0;
+ int mergecsFreqIdx = -1;
+ int *_tmp;
+ oid *_tmp2;
+ oid *tmpPropList;
+ int origSupport = 0;
+ int origCoverage = 0;
+ int numCombinedP = 0;
+
+
+
+
+ /* Get the list of merge FreqIdx */
+ lstMergeCSFreqId = (int*) malloc(sizeof(int) * num);
+
+ for (i = 0; i < num; i++){
+ if (freqCSset->items[lstFreqId[i]].parentFreqIdx != -1){
+ lstMergeCSFreqId[i] =
freqCSset->items[lstFreqId[i]].parentFreqIdx;
+ mergecsFreqIdx = lstMergeCSFreqId[i]; //An existing one
+ isExistingMergeCS = 1;
+ }
+ else
+ lstMergeCSFreqId[i] = lstFreqId[i];
}
- mergecs1->lstConsistsOf = (int*)_tmp1;
- for (i = 0; i < mergecs2->numConsistsOf; i++){
- mergecs1->lstConsistsOf[mergecs1->numConsistsOf] =
mergecs2->lstConsistsOf[i];
- mergecs1->numConsistsOf++;
+ if (isExistingMergeCS == 0) mergecsFreqIdx = freqCSset->numCSadded;
+
+ lstDistinctFreqId = getDistinctList(lstMergeCSFreqId,num, &numDistinct);
+
+ if (numDistinct < 2){
+ free(lstMergeCSFreqId);
+ free(lstDistinctFreqId);
+ return;
}
-
- oldlstProp1 = malloc (sizeof(oid) * mergecs1->numProp);
- memcpy(oldlstProp1, mergecs1->lstProp, (mergecs1->numProp) *
sizeof(oid));
+ /* Create or not create a new CS */
+ if (isExistingMergeCS){
+ newmergeCS = (CS*) &(freqCSset->items[mergecsFreqIdx]);
+ origSupport = newmergeCS->support;
+ origCoverage = newmergeCS->coverage;
+ }
+ else{
+ newmergeCS = (CS*) malloc (sizeof (CS));
+ newmergeCS->support = 0;
+ newmergeCS->coverage = 0;
+ }
+
+ /*Reset parentIdx */
+ newmergeCS->parentFreqIdx = -1;
+ newmergeCS->type = MERGECS;
- oldlstProp2 = malloc (sizeof(oid) * mergecs2->numProp);
- memcpy(oldlstProp2, mergecs2->lstProp, (mergecs2->numProp) *
sizeof(oid));
-
- _tmp2 = realloc(mergecs1->lstProp, ((mergecs1->numProp +
mergecs2->numProp) * sizeof(oid)));
-
- if (!_tmp2){
- fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+
+ /* Calculate number of consistsOf in the merged CS
+
+ and Update support and coverage: Total of all suppors */
+
+ for (i = 0; i < numDistinct; i++){
+ tmpFreqIdx = lstDistinctFreqId[i];
+ mergeNumConsistsOf +=
freqCSset->items[tmpFreqIdx].numConsistsOf;
}
- mergecs1->lstProp = (oid*)_tmp2;
-
- mergeOidSets(oldlstProp1, oldlstProp2, mergecs1->lstProp,
mergecs1->numProp, mergecs2->numProp, &numCombineP);
-
- mergecs1->numProp = numCombineP;
- mergecs1->support += mergecs2->support;
- mergecs1->coverage += mergecs2->coverage;
-
- // Remove mergecs2
- mergecs2->parentFreqIdx = parentFreqIdx;
-
- free(oldlstProp1);
- free(oldlstProp2);
+
+ printf("Number of freqCS consisted in mergeCS: %d \n",
mergeNumConsistsOf);
+ if (isExistingMergeCS){
+ _tmp = realloc(newmergeCS->lstConsistsOf, sizeof(int) *
mergeNumConsistsOf);
+ if (!_tmp){
+ fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+ }
+ newmergeCS->lstConsistsOf = (int*)_tmp;
+ }
+ else{
+ newmergeCS->lstConsistsOf = (int*)malloc(sizeof(int) *
mergeNumConsistsOf);
+ }
+
+ tmpIdx = 0;
+ for (i = 0; i < numDistinct; i++){
+ tmpFreqIdx = lstDistinctFreqId[i];
+ for (j = 0; j < freqCSset->items[tmpFreqIdx].numConsistsOf;
j++){
+ tmpConsistFreqIdx =
freqCSset->items[tmpFreqIdx].lstConsistsOf[j];
+ newmergeCS->lstConsistsOf[tmpIdx] = tmpConsistFreqIdx;
+ //Reset the parentFreqIdx
+ freqCSset->items[tmpConsistFreqIdx].parentFreqIdx =
mergecsFreqIdx;
+ tmpIdx++;
+ }
+ //Update support
+ newmergeCS->support += freqCSset->items[tmpFreqIdx].support;
+ newmergeCS->coverage += freqCSset->items[tmpFreqIdx].coverage;
+ }
+ assert(tmpIdx == mergeNumConsistsOf);
+ newmergeCS->numConsistsOf = mergeNumConsistsOf;
+
+ // Has to minus the support & coverage of the existed CS
+ newmergeCS->support = newmergeCS->support - origSupport;
+ newmergeCS->coverage = newmergeCS->coverage - origCoverage;
+
+ /*Merge the list of prop list */
+ tmpPropList = mergeMultiPropList(freqCSset, lstDistinctFreqId,
numDistinct, &numCombinedP);
+ printf("Combined P has %d props \n", numCombinedP);
+ if (isExistingMergeCS){ //For existed mergeCS, reallocate
lstProp
+ _tmp2 = realloc(newmergeCS->lstProp, sizeof(oid) *
numCombinedP);
+
+ if (!_tmp2){
+ fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+ }
+ newmergeCS->lstProp = (oid*)_tmp2;
+ memcpy(newmergeCS->lstProp, tmpPropList, numCombinedP *
sizeof(oid));
+ newmergeCS->numProp = numCombinedP;
+ newmergeCS->numAllocation = numCombinedP;
+ }
+ else{
+ newmergeCS->lstProp = (oid*) malloc(sizeof(oid) *
numCombinedP);
+ memcpy(newmergeCS->lstProp, tmpPropList, numCombinedP *
sizeof(oid));
+ newmergeCS->numProp = numCombinedP;
+ newmergeCS->numAllocation = numCombinedP;
+ newmergeCS->csId = *mergecsId;
+ mergecsId[0]++;
+ }
+
+ free(tmpPropList);
+
+
+ if (isExistingMergeCS == 0){
+ addCStoSet(freqCSset, *newmergeCS);
+ free(newmergeCS);
+ }
+
+ free(lstMergeCSFreqId);
+ free(lstDistinctFreqId);
}
-*/
static
str printFreqCSSet(CSset *freqCSset, BAT *freqBat, BAT *mapbat, char
isWriteTofile, int freqThreshold, CSlabel* labels){
@@ -1802,9 +1945,9 @@ oid putaCStoHash(CSBats *csBats, oid* ke
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list