Changeset: 0b2321795d4d for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0b2321795d4d
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Collect the list of CS indexes for each property from freqCSset.
This information is used while organizing the triple table.
diffs (220 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -1785,17 +1785,22 @@ PropStat* initPropStat(void){
propStat->numAdded = 0;
propStat->numAllocation = INIT_PROP_NUM;
-
+
+ // For posting list of each prop
+ propStat->plCSidx = (Postinglist*) malloc(sizeof(Postinglist) *
INIT_PROP_NUM);
+ if (propStat->plCSidx == NULL) return NULL;
+
return propStat;
}
static
-void addaProp(PropStat* propStat, oid prop){
+void addaProp(PropStat* propStat, oid prop, int csIdx){
BUN bun;
BUN p;
int* _tmp1;
float* _tmp2;
+ Postinglist* _tmp3;
p = prop;
bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &prop);
@@ -1824,19 +1829,51 @@ void addaProp(PropStat* propStat, oid pr
}
propStat->tfidfs = (float*)_tmp2;
+
+ _tmp3 = realloc(propStat->plCSidx,
((propStat->numAllocation) * sizeof(Postinglist)));
+ if (!_tmp3){
+ fprintf(stderr, "ERROR: Couldn't realloc
memory!\n");
+ }
+
+ propStat->plCSidx = (Postinglist*)_tmp3;
}
+
propStat->freqs[propStat->numAdded] = 1;
+
+ propStat->plCSidx[propStat->numAdded].lstIdx = (int *)
malloc(sizeof(int) * INIT_CS_PER_PROP);
+ if (propStat->plCSidx[propStat->numAdded].lstIdx == NULL){
+ fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+ }
+
+ propStat->plCSidx[propStat->numAdded].lstIdx[0] = csIdx;
+ propStat->plCSidx[propStat->numAdded].numAdded = 1;
+ propStat->plCSidx[propStat->numAdded].numAllocation =
INIT_CS_PER_PROP;
+
propStat->numAdded++;
-
}
else{ /*existing p*/
- propStat->freqs[bun]++;
+ propStat->freqs[bun]++;
+
+ if (propStat->plCSidx[bun].numAdded ==
propStat->plCSidx[bun].numAllocation){
+
+ propStat->plCSidx[bun].numAllocation +=
INIT_CS_PER_PROP;
+
+ _tmp1 = realloc(propStat->plCSidx[bun].lstIdx,
((propStat->plCSidx[bun].numAllocation) * sizeof(int)));
+ if (!_tmp1){
+ fprintf(stderr, "ERROR: Couldn't realloc
memory!\n");
+ }
+ propStat->plCSidx[bun].lstIdx = (int*) _tmp1;
+
+ }
+ propStat->plCSidx[bun].lstIdx[propStat->plCSidx[bun].numAdded]
= csIdx;
+ propStat->plCSidx[bun].numAdded++;
}
}
+
static
-void getPropStatistics(PropStat* propStat, int numMaxCSs, oid*
superCSFreqCSMap, CSset* freqCSset){
+void getPropStatisticsFromMaxCSs(PropStat* propStat, int numMaxCSs, oid*
superCSFreqCSMap, CSset* freqCSset){
int i, j;
oid freqId;
@@ -1847,7 +1884,7 @@ void getPropStatistics(PropStat* propSta
cs = (CS)freqCSset->items[freqId];
for (j = 0; j < cs.numProp; j++){
- addaProp(propStat, cs.lstProp[j]);
+ addaProp(propStat, cs.lstProp[j],freqId);
}
}
@@ -1864,11 +1901,68 @@ void getPropStatistics(PropStat* propSta
*/
}
+
+static
+PropStat* getPropStatisticsFromFreqCSs(CSset* freqCSset){
+
+ int i, j;
+ CS cs;
+
+ PropStat* propStat;
+
+ propStat = initPropStat();
+
+ for (i = 0; i < freqCSset->numCSadded; i++){
+
+ if (freqCSset->items[i].parentFreqIdx == -1){ // Only use the
maximum or merge CS
+ cs = (CS)freqCSset->items[i];
+
+ for (j = 0; j < cs.numProp; j++){
+ addaProp(propStat, cs.lstProp[j], i);
+ }
+ }
+ }
+
+ /* Do not calculate the TFIDF score. May need in the future
+ *
+ for (i = 0; i < propStat->numAdded; i++){
+ propStat->tfidfs[i] = tfidfComp(propStat->freqs[i],numMaxCSs);
+ }
+ */
+
+ return propStat;
+}
+
+static
+void printPropStat(PropStat* propStat){
+ int i, j;
+ oid *pbt;
+ Postinglist ps;
+
+ printf("---- PropStat --- \n");
+ for (i = 0; i < propStat->numAdded; i++){
+ pbt = (oid *) Tloc(propStat->pBat, i);
+ printf("Property " BUNFMT " :\n FreqCSIdx: ", *pbt);
+
+ ps = propStat->plCSidx[i];
+ for (j = 0; j < ps.numAdded; j++){
+ printf(" %d",ps.lstIdx[j]);
+ }
+ printf("\n");
+ }
+}
+
static
void freePropStat(PropStat *propStat){
+ int i;
BBPreclaim(propStat->pBat);
free(propStat->freqs);
free(propStat->tfidfs);
+ for (i = 0; i < propStat->numAdded; i++){
+ free(propStat->plCSidx[i].lstIdx);
+ }
+ free(propStat->plCSidx);
+ free(propStat);
}
@@ -1902,7 +1996,7 @@ void mergeMaximumFreqCSsAll(CSset *freqC
propStat = initPropStat();
- getPropStatistics(propStat, numMaxCSs, superCSFreqCSMap, freqCSset);
+ getPropStatisticsFromMaxCSs(propStat, numMaxCSs, superCSFreqCSMap,
freqCSset);
@@ -2958,6 +3052,7 @@ RDFreorganize(int *ret, bat *sbatid, bat
oid l,r;
bat oNewBatid, pNewBatid;
oid *csMFreqCSMap; /* Store the mapping from a CS id to an
index of a maxCS or mergeCS in freqCSset. */
+ PropStat *propStat;
freqCSset = initCSset();
@@ -3088,6 +3183,8 @@ RDFreorganize(int *ret, bat *sbatid, bat
BATprint(sNewBat);
+ propStat = getPropStatisticsFromFreqCSs(freqCSset);
+ printPropStat(propStat);
freeCSset(freqCSset);
free(subjCSMap);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -51,16 +51,24 @@ typedef struct {
BAT* fullPBat; /* Stores all set of properties */
} CSBats; // BATs for storing all information about CSs
+typedef struct Postinglist{
+ int* lstIdx;
+ int numAdded;
+ int numAllocation;
+} Postinglist;
+
/* Statistic about the properties */
typedef struct PropStat {
- BAT* pBat; /* Store the list of properties */
- int* freqs; /* Store number of CSs containing that property */
- float* tfidfs;
- int numAllocation;
- int numAdded;
+ BAT* pBat; /* Store the list of properties */
+ int* freqs; /* Store number of CSs containing that
property */
+ float* tfidfs;
+ int numAllocation;
+ int numAdded;
+ Postinglist* plCSidx; /* Store posting list of CS index */
} PropStat;
#define INIT_PROP_NUM 10
+#define INIT_CS_PER_PROP 10
#define USINGTFIDF 1
#define STOREFULLCS 1 /* Store full instance of a CS including the a
subject and list of predicates, objects.
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list