Changeset: f6c6a688a1ae for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=f6c6a688a1ae
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:
Fix the problem of duplicated hash keys.
diffs (truncated from 426 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -159,42 +159,125 @@ void appendArrayToBat(BAT *b, BUN* inArr
}
static
-char checkCSduplication(BAT* pOffsetBat, BAT* fullPBat, BUN pos, oid* key, int numK){
+char checkCSduplication(BAT* hsKeyBat, BAT* pOffsetBat, BAT* fullPBat, BUN cskey, oid* key, int numK, oid *csId){
oid *offset;
oid *offset2;
int numP;
int i;
BUN *existvalue;
+ BUN pos;
+ char isDuplication = 0;
- offset = (oid *) Tloc(pOffsetBat, pos);
- if ((pos + 1) < pOffsetBat->batCount){
- offset2 = (oid *)Tloc(pOffsetBat, pos + 1);
- numP = *offset2 - *offset;
- }
- else{
- offset2 = malloc(sizeof(oid));
- *offset2 = BUNlast(fullPBat);
- numP = *offset2 - *offset;
- free(offset2);
+ BATiter bi = bat_iterator(BATmirror(hsKeyBat));
+
+ HASHloop(bi, hsKeyBat->T->hash, pos, (ptr) &cskey){
+ printf(" pos: " BUNFMT, pos);
+
+ offset = (oid *) Tloc(pOffsetBat, pos);
+ if ((pos + 1) < pOffsetBat->batCount){
+ offset2 = (oid *)Tloc(pOffsetBat, pos + 1);
+ numP = *offset2 - *offset;
+ }
+ else{
+ offset2 = malloc(sizeof(oid));
+ *offset2 = BUNlast(fullPBat);
+ numP = *offset2 - *offset;
+ free(offset2);
+ }
+
+
+ // Check each value
+ if (numK != numP) {
+ continue;
+ }
+ else{
+ isDuplication = 1;
+ existvalue = (oid *)Tloc(fullPBat, *offset);
+ for (i = 0; i < numP; i++){
+ //if (key[i] != (int)*existvalue++) {
+ if (key[i] != existvalue[i]) {
+ isDuplication = 0;
+ break;
+ }
+ }
+
+
+ //Everything match
+ if (isDuplication == 1){
+ *csId = pos;
+ return 1;
+ }
+ }
+
}
+ *csId = pos;
- // Check each value
- if (numK != numP) {
- return 0;
- }
- else{
- existvalue = (oid *)Tloc(fullPBat, *offset);
- for (i = 0; i < numP; i++){
- //if (key[i] != (int)*existvalue++) {
- if (key[i] != existvalue[i]) {
- return 0;
- }
+ return 1;
+}
+
+/*
+static
+void testBatHash(void){
+
+ BUN bun;
+ BAT* testBat;
+ int i;
+ oid key[7] = {3,5,6,3,5,7,5};
+ oid csKey;
+
+ testBat = BATnew(TYPE_void, TYPE_oid, smallbatsz);
+
+ for (i = 0; i < 7; i++){
+ csKey = key[i];
+ bun = BUNfnd(BATmirror(testBat),(ptr) &key[i]);
+ if (bun == BUN_NONE) {
+ if (testBat->T->hash && BATcount(testBat) > 4 * testBat->T->hash->mask) {
+ HASHdestroy(testBat);
+ BAThash(BATmirror(testBat), 2*BATcount(testBat));
+ }
+
+ testBat = BUNappend(testBat, (ptr) &csKey, TRUE);
+
+ }
+ else{
+
+ printf("Input: " BUNFMT, csKey);
+ printf(" --> bun: " BUNFMT "\n", bun);
+
+
+
+ testBat = BUNappend(testBat, (ptr) &csKey, TRUE);
+
}
}
-
- return 1;
-}
+ BATprint(testBat);
+
+ BBPreclaim(testBat);
+}
+*/
+
+static
+void addNewCS(CSBats *csBats, BUN* csKey, oid* key, oid *csoid, int num){
+ int freq = 1;
+ BUN offset;
+
+ if (csBats->hsKeyBat->T->hash && BATcount(csBats->hsKeyBat) > 4 * csBats->hsKeyBat->T->hash->mask) {
+ HASHdestroy(csBats->hsKeyBat);
+ BAThash(BATmirror(csBats->hsKeyBat), 2*BATcount(csBats->hsKeyBat));
+ }
+
+ csBats->hsKeyBat = BUNappend(csBats->hsKeyBat, csKey, TRUE);
+
+ (*csoid)++;
+
+ offset = BUNlast(csBats->fullPBat);
+ /* Add list of p to fullPBat and pOffsetBat*/
+ BUNappend(csBats->pOffsetBat, &offset , TRUE);
+ appendArrayToBat(csBats->fullPBat, key, num);
+
+ BUNappend(csBats->freqBat, &freq, TRUE);
+}
/*
* Put a CS to the hashmap.
* While putting CS to the hashmap, update the support (frequency) value
@@ -204,51 +287,45 @@ char checkCSduplication(BAT* pOffsetBat,
* */
static
oid putaCStoHash(CSBats *csBats, oid subjId, oid* key, int num,
- oid *csoid, char isStoreFreqCS, int freqThreshold, CSset **freqCSset){
+ oid *csoid, char isStoreFreqCS, int freqThreshold, CSset *freqCSset){
BUN csKey;
- int freq = 0;
+ int *freq;
CS *freqCS;
BUN bun;
- BUN offset;
oid csId; /* Id of the characteristic set */
char isDuplicate = 0;
csKey = RDF_hash_oidlist(key, num);
bun = BUNfnd(BATmirror(csBats->hsKeyBat),(ptr) &csKey);
if (bun == BUN_NONE) {
- if (csBats->hsKeyBat->T->hash && BATcount(csBats->hsKeyBat) > 4 * csBats->hsKeyBat->T->hash->mask) {
- HASHdestroy(csBats->hsKeyBat);
- BAThash(BATmirror(csBats->hsKeyBat), 2*BATcount(csBats->hsKeyBat));
- }
-
- csBats->hsKeyBat = BUNappend(csBats->hsKeyBat, (ptr) &csKey, TRUE);
-
-
- csId = *csoid;
- (*csoid)++;
-
- offset = BUNlast(csBats->fullPBat);
- /* Add list of p to fullPBat and pOffsetBat*/
- BUNappend(csBats->pOffsetBat, &offset , TRUE);
- appendArrayToBat(csBats->fullPBat, key, num);
-
+ addNewCS(csBats, &csKey, key, csoid, num);
+ csId = *csoid;
}
else{
printf("Same HashKey: ");
- csId = bun;
/* Check whether it is really an duplication (same hashvalue but different list of */
- isDuplicate = checkCSduplication(csBats->pOffsetBat, csBats->fullPBat, bun, key, num );
+ isDuplicate = checkCSduplication(csBats->hsKeyBat, csBats->pOffsetBat, csBats->fullPBat, csKey, key, num, &csId);
- if (isDuplicate == 0)
+ if (isDuplicate == 0) {
printf(" No duplication (new CS) \n");
- else
- printf(" Duplication (existed CS) \n");
+ // New CS
+ addNewCS(csBats, &csKey, key, csoid, num);
+ csId = *csoid;
- if (isStoreFreqCS == 1){ /* Store the frequent CS to the CSset*/
- //printf("FreqCS: Support = %d, Threshold %d \n ", freq, freqThreshold);
- if (freq == freqThreshold){
- freqCS = creatCS(subjId, num, key);
- addCStoSet(*freqCSset, *freqCS);
+ }
+ else{
+ printf(" Duplication (existed CS) at csId = " BUNFMT "\n", csId);
+
+ // Update freqCS value
+ freq = (int *)Tloc(csBats->freqBat, csId);
+ (*freq)++;
+
+ if (isStoreFreqCS == 1){ /* Store the frequent CS to the CSset*/
+ //printf("FreqCS: Support = %d, Threshold %d \n ", freq, freqThreshold);
+ if (*freq == freqThreshold){
+ freqCS = creatCS(subjId, num, key);
+ addCStoSet(freqCSset, *freqCS);
+ }
}
}
}
@@ -501,6 +578,12 @@ CSBats* initCSBats(void){
if (csBats->hsKeyBat == NULL) {
return NULL;
}
+
+ (void)BATprepareHash(BATmirror(csBats->hsKeyBat));
+ if (!(csBats->hsKeyBat->T->hash)){
+ return NULL;
+ }
+
csBats->hsValueBat = BATnew(TYPE_void, TYPE_int, smallbatsz);
if (csBats->hsValueBat == NULL) {
@@ -536,12 +619,10 @@ void freeCSBats(CSBats *csBats){
}
-/* Extract CS from SPO triples table */
-str
-RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, int *freqThreshold){
+static
+str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi, CSset *freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap){
+
BUN p, q;
- BAT *sbat = NULL, *pbat = NULL, *obat = NULL;
- BATiter si, pi, oi; /*iterator for BAT of s,p,o columns in spo table */
oid *sbt, *pbt, *obt;
oid curS; /* current Subject oid */
oid curP; /* current Property oid */
@@ -550,45 +631,11 @@ RDFextractCSwithTypes(int *ret, bat *sba
oid* buff;
int INIT_PROPERTY_NUM = 5000;
int maxNumProp = 0;
- CSset *freqCSset; /* Set of frequent CSs */
oid objType;
-
- CSBats *csBats;
- oid *subjCSMap; /* Store the correspoinding CS Id for each subject */
- BUN *maxSoid;
oid returnCSid;
buff = (oid *) malloc (sizeof(oid) * INIT_PROPERTY_NUM);
- if ((sbat = BATdescriptor(*sbatid)) == NULL) {
- throw(MAL, "rdf.RDFextractCSwithTypes", RUNTIME_OBJECT_MISSING);
- }
- if (!(sbat->tsorted)){
- throw(MAL, "rdf.RDFextractCSwithTypes", "sbat is not sorted");
- }
-
- if ((pbat = BATdescriptor(*pbatid)) == NULL) {
- throw(MAL, "rdf.RDFextractCSwithTypes", RUNTIME_OBJECT_MISSING);
- }
- if ((obat = BATdescriptor(*obatid)) == NULL) {
- throw(MAL, "rdf.RDFextractCSwithTypes", RUNTIME_OBJECT_MISSING);
- }
-
- maxSoid = (BUN *) Tloc(sbat, BUNlast(sbat) - 1);
-
- subjCSMap = (oid *) malloc (sizeof(oid) * ((*maxSoid) + 1));
- initArray(subjCSMap, (*maxSoid), GDK_oid_max);
-
- si = bat_iterator(sbat);
- pi = bat_iterator(pbat);
- oi = bat_iterator(obat);
-
-
- csBats = initCSBats();
-
-
- freqCSset = initCSset();
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list