Changeset: bc147d8e4e56 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=bc147d8e4e56
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Fix bug while removing subjects with lots of missing properties.
diffs (truncated from 341 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -6002,59 +6002,66 @@ str initFullSampleData(CSSampleExtend *c
int curStrLen =0;
int tmpStrLen =0;
oid *tmpOid = (oid *) BUNtail(tmpi,
ranPosition);
- //tmpOid refer to the keyBat of the mv bats
-
- //Get the range of multi-values in keyBat
- tmpmvKeyBat =
cstablestat->lstcstable[i].lstMVTables[j].keyBat;
+ if (*tmpOid != oid_nil){
+
+ //tmpOid refer to the keyBat of the mv bats
+
+ //Get the range of multi-values in keyBat
+ tmpmvKeyBat =
cstablestat->lstcstable[i].lstMVTables[j].keyBat;
+
+ mvRefOid = *tmpOid;
+ tmpmvRefOid = (oid *)
Tloc(tmpmvKeyBat, mvRefOid);
+ assert(tmpmvRefOid != NULL);
+
+ //printf("First position for
multivalues in keybat %d \n", (int) (*tmpmvRefOid));
+
+ tmpNumMVCols =
cstablestat->lstcstable[i].lstMVTables[j].numCol;
+ //printf("Table %d colum %d is
a mv col with %d types \n",i,j,tmpNumMVCols);
+
+ tmpPos = *tmpOid;
+ while (*tmpmvRefOid ==
mvRefOid){
+ //Concat the data from each column
+ for (mvColIdx =0;
mvColIdx < tmpNumMVCols; mvColIdx++){
+ tmpmvBat =
cstablestat->lstcstable[i].lstMVTables[j].mvBats[mvColIdx];
+ tmpObjType =
getObjTypeFromBATtype(tmpmvBat->ttype);
+ if
(getObjValueFromMVBat(&vrRealObjValue, &vrCastedObjValue, tmpPos, tmpObjType,
tmpmvBat, lmap, rmap) == 1){
+
//printf("Casted value at mvBat %d is %s
\n",mvColIdx,vrCastedObjValue.val.sval);
+
tmpStrLen = strlen(vrCastedObjValue.val.sval);
+ if
(tmpMVSampleStr == NULL){
+
tmpMVSampleStr = (str) GDKmalloc(tmpStrLen + 1);
+
s = tmpMVSampleStr;
+ }else{
+
tmpMVSampleStr = (str) GDKrealloc(tmpMVSampleStr, curStrLen + tmpStrLen + 2);
+
s = tmpMVSampleStr;
+
s += curStrLen;
+ }
+
+
strcpy(s, vrCastedObjValue.val.sval);
+ s +=
tmpStrLen;
+ *s++ =
';';
+ *s =
'\0';
+
+
curStrLen = strlen(tmpMVSampleStr);
+
//printf("Current tmpMVSampleStr String %s --> curLen = %d \n",tmpMVSampleStr,
curStrLen);
+
+
VALclear(&vrCastedObjValue);
+
VALclear(&vrRealObjValue);
+ }
+ }
+
+
+ //Get next
+ tmpPos++;
+ if (tmpPos ==
BATcount(tmpmvKeyBat)) break;
+
+ tmpmvRefOid = (oid *)
Tloc(tmpmvKeyBat, tmpPos);
+ }
+
+ }
+ //else{
+ //printf("[Null] There is no
set of multiple values for this subject");
- mvRefOid = *tmpOid;
- tmpmvRefOid = (oid *) Tloc(tmpmvKeyBat,
mvRefOid);
- assert(tmpmvRefOid != NULL);
-
- //printf("First position for
multivalues in keybat %d \n", (int) (*tmpmvRefOid));
-
- tmpNumMVCols =
cstablestat->lstcstable[i].lstMVTables[j].numCol;
- //printf("Table %d colum %d is a mv col
with %d types \n",i,j,tmpNumMVCols);
-
- tmpPos = *tmpOid;
- while (*tmpmvRefOid == mvRefOid){
- //Concat the data from each column
- for (mvColIdx =0; mvColIdx <
tmpNumMVCols; mvColIdx++){
- tmpmvBat =
cstablestat->lstcstable[i].lstMVTables[j].mvBats[mvColIdx];
- tmpObjType =
getObjTypeFromBATtype(tmpmvBat->ttype);
- if
(getObjValueFromMVBat(&vrRealObjValue, &vrCastedObjValue, tmpPos, tmpObjType,
tmpmvBat, lmap, rmap) == 1){
-
//printf("Casted value at mvBat %d is %s
\n",mvColIdx,vrCastedObjValue.val.sval);
- tmpStrLen =
strlen(vrCastedObjValue.val.sval);
- if
(tmpMVSampleStr == NULL){
-
tmpMVSampleStr = (str) GDKmalloc(tmpStrLen + 1);
- s =
tmpMVSampleStr;
- }else{
-
tmpMVSampleStr = (str) GDKrealloc(tmpMVSampleStr, curStrLen + tmpStrLen + 2);
- s =
tmpMVSampleStr;
- s +=
curStrLen;
- }
-
- strcpy(s,
vrCastedObjValue.val.sval);
- s += tmpStrLen;
- *s++ = ';';
- *s = '\0';
-
- curStrLen =
strlen(tmpMVSampleStr);
-
//printf("Current tmpMVSampleStr String %s --> curLen = %d \n",tmpMVSampleStr,
curStrLen);
-
-
VALclear(&vrCastedObjValue);
-
VALclear(&vrRealObjValue);
- }
- }
-
-
- //Get next
- tmpPos++;
- if (tmpPos ==
BATcount(tmpmvKeyBat)) break;
-
- tmpmvRefOid = (oid *)
Tloc(tmpmvKeyBat, tmpPos);
- }
-
+ //}
if (tmpMVSampleStr != NULL){
tmpMVSampleStr = (str)
GDKrealloc(tmpMVSampleStr, curStrLen + 1);
tmpMVSampleStr[curStrLen] =
'\0';
@@ -8898,7 +8905,7 @@ void getRealValue(ValPtr returnValue, oi
}while (0)
-str RDFdistTriplesToCSs(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat
*mbatid, bat *lmapbatid, bat *rmapbatid, PropStat* propStat, CStableStat
*cstablestat, CSPropTypes *csPropTypes, oid* lastSubjId){
+str RDFdistTriplesToCSs(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat
*mbatid, bat *lmapbatid, bat *rmapbatid, PropStat* propStat, CStableStat
*cstablestat, CSPropTypes *csPropTypes, oid* lastSubjId, char *isLotsNullSubj){
BAT *sbat = NULL, *pbat = NULL, *obat = NULL, *mbat = NULL, *lmap =
NULL, *rmap = NULL;
BATiter si,pi,oi, mi;
@@ -8922,6 +8929,8 @@ str RDFdistTriplesToCSs(int *ret, bat *s
int lasttblIdx = -1;
int lastColIdx = -1;
int lastPropIdx = -1;
+ int numEmptyBat = 0;
+
char isSetLasttblIdx = 0;
ObjectType objType, defaultType;
char tmpTableType = 0;
@@ -8954,6 +8963,8 @@ str RDFdistTriplesToCSs(int *ret, bat *s
char isFKCol = 0;
#endif
+ (void) isLotsNullSubj;
+
maxOrigPbt = ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID)) - 1;
if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
throw(RDF, "RDFdistTriplesToCSs",
@@ -9031,11 +9042,20 @@ str RDFdistTriplesToCSs(int *ret, bat *s
//printf(" --> Tbl: %d tmpSoid: " BUNFMT " | Last SubjId "
BUNFMT "\n", tblIdx,tmpSoid, lastSubjId[tblIdx]);
- if (tblIdx == -1){ // This is for irregular triples, put
them to pso table
- insToPSO(cstablestat->pbat,cstablestat->sbat,
cstablestat->obat, pbt, sbt, obt);
- //printf(" ==> To PSO \n");
- isFKCol = 0;
- continue;
+ if (tblIdx == -1){
+ #if REMOVE_LOTSOFNULL_SUBJECT
+ if (isLotsNullSubj[*sbt] == 0){
+ // This is for irregular triples, put them to pso table
+ insToPSO(cstablestat->pbat,cstablestat->sbat,
cstablestat->obat, pbt, sbt, obt);
+ //printf(" ==> To PSO \n");
+ isFKCol = 0;
+ continue;
+ }
+ #else
+ insToPSO(cstablestat->pbat,cstablestat->sbat,
cstablestat->obat, pbt, sbt, obt);
+ isFKCol = 0;
+ continue;
+ #endif
}
if (*pbt != lastP){
@@ -9068,10 +9088,18 @@ str RDFdistTriplesToCSs(int *ret, bat *s
}
+ #if REMOVE_LOTSOFNULL_SUBJECT
+ if (tblIdx == -1 && isLotsNullSubj[*sbt]){
+ // A lots-of-null subject
+ insToPSO(cstablestat->pbat,cstablestat->sbat,
cstablestat->obat, pbt, sbt, obt);
+
+ continue;
+ }
+ #endif
+
objType = getObjType(*obt);
assert (objType != BLANKNODE);
-
tmpPropIdx = tmpTblIdxPropIdxMap[tblIdx];
//printf(" PropIdx = %d \n", tmpPropIdx);
tmpColIdx =
csPropTypes[tblIdx].lstPropTypes[tmpPropIdx].defColIdx;
@@ -9153,7 +9181,9 @@ str RDFdistTriplesToCSs(int *ret, bat *s
#endif
isSetLasttblIdx = 1;
}
-
+
+
+
/* New column. Finish with lastTblIdx and lastColIdx. Note: This lastColIdx is
* the position of the prop in a final CS. Not the exact colIdx in MAINTBL or TYPETBL
* */
@@ -9246,6 +9276,7 @@ str RDFdistTriplesToCSs(int *ret, bat *s
#endif
#if COUNT_DISTINCT_REFERRED_S
if (isFKCol){
+ assert(tmpFKHashBat != NULL);
tmpFKRefBun =
BUNfnd(BATmirror(tmpFKHashBat),(ptr) obt);
if (tmpFKRefBun == BUN_NONE){
@@ -9264,8 +9295,7 @@ str RDFdistTriplesToCSs(int *ret, bat *s
}
#endif
}
-
-
+
if (istmpMVProp == 1){ // This is a multi-valued prop
//printf("Multi values prop \n");
if (*sbt != lastS){
@@ -9452,13 +9482,24 @@ str RDFdistTriplesToCSs(int *ret, bat *s
throw(RDF, "rdf.RDFdistTriplesToCSs", "Problem in filling
missing values all");
}
-
+ numEmptyBat = 0;
// Keep the batCacheId
for (i = 0; i < cstablestat->numTables; i++){
//printf("----- Table %d ------ \n",i );
for (j = 0; j < cstablestat->numPropPerTable[i];j++){
//printf("Column %d \n", j);
cstablestat->lstbatid[i][j] =
cstablestat->lstcstable[i].colBats[j]->batCacheid;
+ tmpBat = cstablestat->lstcstable[i].colBats[j];
+ if (BATcount(tmpBat) == 0) {
+ printf("Empty Bats at table %d column %d
\n",i,j);
+ numEmptyBat++;
+ fillMissingvalues(tmpBat,
(int)BATcount(tmpBat), (int)lastSubjId[i]);
+ }
+ if (j > 0)
+ if
(BATcount(cstablestat->lstcstable[i].colBats[j]) > 0 &&
+
BATcount(cstablestat->lstcstable[i].colBats[j-1]) > 0){
+
assert(BATcount(cstablestat->lstcstable[i].colBats[j]) ==
BATcount(cstablestat->lstcstable[i].colBats[j-1]));
+ }
//BATprint(cstablestat->lstcstable[i].colBats[j]);
if (csPropTypes[i].lstPropTypes[j].isMVProp){
//printf("MV Columns: \n");
@@ -9475,6 +9516,9 @@ str RDFdistTriplesToCSs(int *ret, bat *s
*ret = 1;
printf(" ... Done \n");
+ printf("Number of full empty bats %d \n",numEmptyBat);
+
+ printf("Number of triples in PSO table is "BUNFMT"\n",
BATcount(cstablestat->pbat));
BBPunfix(sbat->batCacheid);
BBPunfix(pbat->batCacheid);
@@ -9509,6 +9553,8 @@ RDFreorganize(int *ret, CStableStat *cst
int freqIdx;
int numSubjRemoved = 0;
#endif
+ char *isLotsNullSubj = NULL;
+
oid lastS;
oid l,r;
bat oNewBatid, pNewBatid;
@@ -9692,17 +9738,33 @@ RDFreorganize(int *ret, CStableStat *cst
lastSubjId = (oid *) malloc (sizeof(oid) * cstablestat->numTables);
initArray(lastSubjId, cstablestat->numTables, -1);
- printf("Re-assigning Subject oids ... ");
+ #if REMOVE_LOTSOFNULL_SUBJECT
+ //TODO: Find the better way than using isLotsNullSubj array to keep
+ //the status of subject
+ isLotsNullSubj = (char *) malloc(sizeof(char) * BATcount(sbat) + 1);
+ initCharArray(isLotsNullSubj, BATcount(sbat) + 1,0);
+ #else
+ (void) isLotsNullSubj;
+ #endif
+
+ printf("Re-assigning Subject oids ... \n");
lastS = -1;
BATloop(sbat, p, q){
sbt = (oid *) BUNtloc(si, p);
tblIdx = csTblIdxMapping[subjCSMap[*sbt]];
-
+
#if REMOVE_LOTSOFNULL_SUBJECT
- freqIdx = csFreqCSMapping[subjCSMap[*sbt]];
- if (freqCSset->items[freqIdx].numProp <
cstablestat->lstcstable[tblIdx].numCol * LOTSOFNULL_SUBJECT_THRESHOLD){
- tblIdx = -1;
- numSubjRemoved++;
+ //TODO: If the subject is the target
+ // of an FK prop, do not remove that subject. This is hard to check.
+ //
+ if (tblIdx != -1){
+ freqIdx = csFreqCSMapping[subjCSMap[*sbt]];
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list