Changeset: 78cc3766e723 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=78cc3766e723
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Fix a bug when probing FK relationships in large datasets.
- Pay attention to the last value returned by HASHloop: it is BUN_NONE.
diffs (269 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -80,6 +80,91 @@ void addCStoSet(CSset *csSet, CS item)
}
+/*
+ * Allocate an empty CSrel for the CS identified by csoid.
+ *
+ * Both per-reference arrays start with room for INIT_NUM_CSREL entries
+ * and numRef is set to 0.
+ *
+ * Returns the new CSrel, or NULL if any allocation fails (nothing is
+ * leaked in that case). The caller owns the struct and both arrays.
+ */
static
+CSrel* creataCSrel(oid csoid){
+	CSrel *csrel = malloc(sizeof(*csrel));
+
+	if (csrel == NULL)
+		return NULL;
+
+	csrel->origCSoid = csoid;
+	csrel->lstRefCSoid = malloc(sizeof(oid) * INIT_NUM_CSREL);
+	csrel->lstCnt = malloc(sizeof(int) * INIT_NUM_CSREL);
+	if (csrel->lstRefCSoid == NULL || csrel->lstCnt == NULL){
+		/* free(NULL) is a no-op, so release whatever was obtained */
+		free(csrel->lstRefCSoid);
+		free(csrel->lstCnt);
+		free(csrel);
+		return NULL;
+	}
+	csrel->numRef = 0;
+	csrel->numAllocation = INIT_NUM_CSREL;
+
+	return csrel;
+}
+
+/*
+ * Build an array of numCSrel empty CSrel entries, where entry i has
+ * origCSoid == i.
+ *
+ * Each entry is created with creataCSrel() and copied by value into the
+ * array; the temporary struct is freed after the copy, while the two
+ * arrays it points to are now owned by the array entry.
+ *
+ * Returns the array, or NULL if an allocation fails (everything built so
+ * far is released first).
+ *
+ * NOTE(review): entries are indexed 0 .. numCSrel-1. A caller that indexes
+ * by CS oid needs numCSrel to be the largest oid + 1 -- confirm at the
+ * call site.
+ */
+static
+CSrel* initCSrelset(oid numCSrel){
+	oid i;
+	CSrel *csrelSet;
+	CSrel *csrel;
+
+	csrelSet = malloc(sizeof(CSrel) * numCSrel);
+	if (csrelSet == NULL)
+		return NULL;
+
+	for (i = 0; i < numCSrel; i++){
+		csrel = creataCSrel(i);
+		if (csrel == NULL){
+			/* undo the entries that were already filled in */
+			while (i > 0){
+				i--;
+				free(csrelSet[i].lstRefCSoid);
+				free(csrelSet[i].lstCnt);
+			}
+			free(csrelSet);
+			return NULL;
+		}
+		csrelSet[i] = *csrel;
+		/* only the shell: its arrays now belong to csrelSet[i] */
+		free(csrel);
+	}
+	return csrelSet;
+}
+
+/*
+ * Print to stdout every entry of csrelSet[0 .. num-1] that has at least
+ * one reference (numRef != 0), one line per entry:
+ *   the originating CS oid, then each referenced CS oid followed by its
+ *   count in parentheses.
+ */
+static
+void printCSrelSet(CSrel *csrelSet, int num){
+	int idx;
+
+	for (idx = 0; idx < num; idx++){
+		const CSrel *rel = &csrelSet[idx];
+		int k;
+
+		/* Entries without any FK reference are not printed */
+		if (rel->numRef == 0)
+			continue;
+
+		printf("Relationship i: ");
+		printf("CS " BUNFMT " --> ", rel->origCSoid);
+		for (k = 0; k < rel->numRef; k++){
+			printf(BUNFMT " (%d) ", rel->lstRefCSoid[k], rel->lstCnt[k]);
+		}
+		printf("\n");
+	}
+}
+
+/*
+ * Record one reference from CS origCSoid to CS refCSoid in csrel.
+ *
+ * If refCSoid is already listed, its counter is incremented. Otherwise it
+ * is appended with a count of 1, growing both arrays by INIT_NUM_CSREL
+ * entries when they are full.
+ *
+ * origCSoid must equal csrel->origCSoid (checked with assert).
+ *
+ * If the arrays cannot be grown, an error is written to stderr and the
+ * reference is not recorded; csrel keeps its previous, valid contents.
+ */
+static
+void addReltoCSRel(oid origCSoid, oid refCSoid, CSrel *csrel)
+{
+	int i;
+	int newAllocation;
+	oid *newRefs;
+	int *newCnts;
+
+	assert (origCSoid == csrel->origCSoid);
+	(void) origCSoid;	/* only used by the assert above */
+
+	/* Existing reference: just bump its counter */
+	for (i = 0; i < csrel->numRef; i++){
+		if (refCSoid == csrel->lstRefCSoid[i]){
+			csrel->lstCnt[i]++;
+			return;
+		}
+	}
+
+	/* New reference: make room first if both arrays are full */
+	if (csrel->numRef == csrel->numAllocation){
+		newAllocation = csrel->numAllocation + INIT_NUM_CSREL;
+
+		/*
+		 * Grow one array at a time and store each result only on
+		 * success, so a failed realloc never replaces a live pointer
+		 * with NULL.
+		 */
+		newRefs = realloc(csrel->lstRefCSoid, newAllocation * sizeof(oid));
+		if (newRefs == NULL){
+			fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+			return;
+		}
+		csrel->lstRefCSoid = newRefs;
+
+		newCnts = realloc(csrel->lstCnt, newAllocation * sizeof(int));
+		if (newCnts == NULL){
+			/* numAllocation stays at the old size, which both arrays still satisfy */
+			fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+			return;
+		}
+		csrel->lstCnt = newCnts;
+
+		csrel->numAllocation = newAllocation;
+	}
+
+	csrel->lstRefCSoid[csrel->numRef] = refCSoid;
+	csrel->lstCnt[csrel->numRef] = 1;
+	csrel->numRef++;
+}
+
+static
void freeCSset(CSset *csSet){
int i;
for(i = 0; i < csSet->numCSadded; i ++){
@@ -171,7 +256,7 @@ char checkCSduplication(BAT* hsKeyBat, B
BATiter bi = bat_iterator(BATmirror(hsKeyBat));
HASHloop(bi, hsKeyBat->T->hash, pos, (ptr) &cskey){
- printf(" pos: " BUNFMT, pos);
+ //printf(" pos: " BUNFMT, pos);
offset = (oid *) Tloc(pOffsetBat, pos);
if ((pos + 1) < pOffsetBat->batCount){
@@ -204,16 +289,19 @@ char checkCSduplication(BAT* hsKeyBat, B
//Everything match
if (isDuplication == 1){
+ //printf("Everything match!!!!!");
*csId = pos;
return 1;
}
}
+
+
+ }
+
- }
+ *csId = pos; // = BUN_NONE
- *csId = pos;
-
- return 1;
+ return 0;
}
/*
@@ -300,9 +388,10 @@ oid putaCStoHash(CSBats *csBats, oid sub
if (bun == BUN_NONE) {
addNewCS(csBats, &csKey, key, csoid, num);
csId = *csoid;
+ //assert(csId != BUN_NONE);
}
else{
- printf("Same HashKey: ");
+ //printf("Same HashKey: ");
/* Check whether it is really an duplication (same hashvalue
but different list of */
isDuplicate = checkCSduplication(csBats->hsKeyBat,
csBats->pOffsetBat, csBats->fullPBat, csKey, key, num, &csId);
@@ -314,7 +403,7 @@ oid putaCStoHash(CSBats *csBats, oid sub
}
else{
- printf(" Duplication (existed CS) at csId = " BUNFMT
"\n", csId);
+ //printf(" Duplication (existed CS) at csId = " BUNFMT
"\n", csId);
// Update freqCS value
freq = (int *)Tloc(csBats->freqBat, csId);
@@ -330,6 +419,12 @@ oid putaCStoHash(CSBats *csBats, oid sub
}
}
+ if (csId == BUN_NONE){
+ printf("Not acceptable cdId " BUNFMT " \n", csId);
+ }
+
+ //assert(csId != BUN_NONE);
+
return csId;
}
@@ -620,7 +715,7 @@ void freeCSBats(CSBats *csBats){
static
-str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, CSset
*freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap){
+str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, CSset
*freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap, oid *maxCSoid){
BUN p, q;
oid *sbt, *pbt;
@@ -650,6 +745,9 @@ str RDFassignCSId(int *ret, BAT *sbat, B
if (numP > maxNumProp)
maxNumProp = numP;
+ if (returnCSid > *maxCSoid)
+ *maxCSoid = returnCSid;
+
}
curS = *sbt;
curP = 0;
@@ -679,6 +777,9 @@ str RDFassignCSId(int *ret, BAT *sbat, B
if (numP > maxNumProp)
maxNumProp = numP;
+ if (returnCSid > *maxCSoid)
+ *maxCSoid = returnCSid;
+
free (buff);
*ret = 1;
@@ -688,7 +789,7 @@ str RDFassignCSId(int *ret, BAT *sbat, B
static
str RDFrelationships(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi,
CSset *freqCSset,
- int *freqThreshold, CSBats* csBats, oid *subjCSMap, BUN
maxSoid){
+ int *freqThreshold, CSBats* csBats, oid *subjCSMap, BUN
maxSoid, BUN maxCSoid){
BUN p, q;
oid *sbt, *pbt, *obt;
@@ -701,6 +802,9 @@ str RDFrelationships(int *ret, BAT *sbat
int maxNumProp = 0;
oid objType;
oid returnCSid;
+ CSrel* csrelSet;
+
+ csrelSet = initCSrelset(maxCSoid);
buff = (oid *) malloc (sizeof(oid) * INIT_PROPERTY_NUM);
@@ -744,7 +848,8 @@ str RDFrelationships(int *ret, BAT *sbat
/* Look at sbat*/
if (objType == URI){
if (*obt <= maxSoid && subjCSMap[*obt] != BUN_NONE){
- printf(" CS " BUNFMT " refer to CS " BUNFMT "
\n",*sbt, subjCSMap[*obt]);
+ ////printf(" Subject " BUNFMT " refer to CS "
BUNFMT " \n",*sbt, subjCSMap[*obt]);
+ addReltoCSRel(subjCSMap[*sbt], subjCSMap[*obt],
&csrelSet[subjCSMap[*sbt]]);
}
}
}
@@ -759,6 +864,8 @@ str RDFrelationships(int *ret, BAT *sbat
free (buff);
+ printCSrelSet(csrelSet,maxCSoid);
+
*ret = 1;
return MAL_SUCCEED;
@@ -775,6 +882,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
CSBats *csBats;
oid *subjCSMap; /* Store the correspoinding CS Id for each
subject */
BUN *maxSoid;
+ oid maxCSoid = 0;
if ((sbat = BATdescriptor(*sbatid)) == NULL) {
throw(MAL, "rdf.RDFextractCSwithTypes", RUNTIME_OBJECT_MISSING);
@@ -805,10 +913,12 @@ RDFextractCSwithTypes(int *ret, bat *sba
initArray(subjCSMap, (*maxSoid), BUN_NONE);
//Phase 1: Assign an ID for each CS
- RDFassignCSId(ret, sbat, si, pi, freqCSset, freqThreshold, csBats,
subjCSMap);
+ RDFassignCSId(ret, sbat, si, pi, freqCSset, freqThreshold, csBats,
subjCSMap, &maxCSoid);
+
+ printf("Max CS oid: " BUNFMT "\n", maxCSoid);
//Phase 2: Check the relationship
- RDFrelationships(ret, sbat, si, pi, oi, freqCSset, freqThreshold,
csBats, subjCSMap, *maxSoid);
+ RDFrelationships(ret, sbat, si, pi, oi, freqCSset, freqThreshold,
csBats, subjCSMap, *maxSoid, maxCSoid);
printf("Number of frequent CSs is: %d \n", freqCSset->numCSadded);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -57,4 +57,13 @@ typedef struct CSset{
int numAllocation;
} CSset;
+#define INIT_NUM_CSREL 4 /* initial capacity, and growth step, of the two arrays in CSrel */
+typedef struct CSrel{ // References from one CS to other CSs
+ oid origCSoid; // oid of the CS the references originate from
+ oid* lstRefCSoid; // oids of the referenced CSs; first numRef entries are in use
+ int* lstCnt; // Count per reference: lstCnt[k] belongs to lstRefCSoid[k]
+ int numRef; // Number of distinct referenced CSs currently stored
+ int numAllocation; // Number of entries allocated in each of the two arrays
+} CSrel;
+
#endif /* _RDFSCHEMA_H_ */
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list