Changeset: 268c8c805182 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=268c8c805182
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Collect statistics on the types of the CS properties, which will be used for
creating relational columns.
Optimize, and fix a bug in, the use of labels for finding maxCS and mergedCS
diffs (truncated from 460 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -683,6 +683,86 @@ void printSubCSInformation(SubCSSet *sub
}
}
+/*
+ * Init property types for each CS in FreqCSset (after merging).
+ * Only CSs without a parent (parentFreqIdx == -1), i.e. the maximum or
+ * merged CSs, get an entry. For each property of such a CS, room is
+ * reserved for all possible types (MULTIVALUES + 1) and the frequency of
+ * every type is set to 0. lstTypes is allocated but not filled in here.
+ *
+ * freqCSset:   set of frequent CSs to take the properties from
+ * numMergedCS: number of entries to allocate; must equal the number of
+ *              parent-less CSs in freqCSset
+ *
+ * Returns the GDK-allocated array of numMergedCS entries, or NULL when an
+ * allocation fails (everything allocated up to that point is released).
+ * */
+static
+CSPropTypes* initCSPropTypes(CSset* freqCSset, int numMergedCS){
+	int numFreqCS = freqCSset->numCSadded;
+	int i, j, k;
+	int id = 0;	/* number of entries whose lstPropTypes has been set up */
+	CSPropTypes* csPropTypes;
+	CSPropTypes* entry;
+	PropTypes* pt;
+
+	csPropTypes = (CSPropTypes*) GDKmalloc(sizeof(CSPropTypes) * numMergedCS);
+	if (csPropTypes == NULL) return NULL;
+
+	for (i = 0; i < numFreqCS; i++){
+		if (freqCSset->items[i].parentFreqIdx != -1) continue;	// Only use the maximum or merge CS
+
+		/* Detect a wrong numMergedCS before writing past the array */
+		assert(id < numMergedCS);
+
+		entry = &csPropTypes[id];
+		entry->freqCSId = i;
+		/* numProp counts the fully initialized properties while building,
+		 * so that the failure path knows exactly what to release */
+		entry->numProp = 0;
+		entry->lstPropTypes = (PropTypes*) GDKmalloc(sizeof(PropTypes) * freqCSset->items[i].numProp);
+		if (entry->lstPropTypes == NULL && freqCSset->items[i].numProp > 0) goto fail;
+		id++;
+
+		for (j = 0; j < freqCSset->items[i].numProp; j++){
+			pt = &entry->lstPropTypes[j];
+			pt->prop = freqCSset->items[i].lstProp[j];
+			pt->numType = MULTIVALUES + 1;
+			pt->lstTypes = (char*) GDKmalloc(sizeof(char) * pt->numType);
+			pt->lstFreq = (int*) GDKmalloc(sizeof(int) * pt->numType);
+			if (pt->lstTypes == NULL || pt->lstFreq == NULL){
+				if (pt->lstTypes != NULL) GDKfree(pt->lstTypes);
+				if (pt->lstFreq != NULL) GDKfree(pt->lstFreq);
+				goto fail;
+			}
+			for (k = 0; k < pt->numType; k++){
+				pt->lstFreq[k] = 0;
+			}
+			entry->numProp = j + 1;
+		}
+	}
+
+	assert(id == numMergedCS);
+
+	return csPropTypes;
+
+fail:
+	/* Release the entries [0, id) and, within each, the completed properties */
+	for (i = 0; i < id; i++){
+		for (j = 0; j < csPropTypes[i].numProp; j++){
+			GDKfree(csPropTypes[i].lstPropTypes[j].lstTypes);
+			GDKfree(csPropTypes[i].lstPropTypes[j].lstFreq);
+		}
+		if (csPropTypes[i].lstPropTypes != NULL) GDKfree(csPropTypes[i].lstPropTypes);
+	}
+	GDKfree(csPropTypes);
+	return NULL;
+}
+
+
+/*
+ * Add types of properties
+ * Note that the property list is sorted by prop's oids
+ * E.g., buffP = {3, 5, 7}
+ * csPropTypes[tbIdx] contains properties {1,3,4,5,7} with types for each property and frequency of each <property, type>
+ *
+ * buffTypes:       type of each property in buffP (used as index in lstFreq)
+ * buffP:           properties of one subject
+ * numP:            number of valid entries in buffTypes / buffP
+ * csId:            index into csTblIdxMapping
+ * csTblIdxMapping: maps csId to an index in csPropTypes, or -1 for none
+ * csPropTypes:     statistics to update
+ *
+ * Both lists are walked with a single forward scan. A property that is
+ * not found in the table's list ends the scan, and a type outside
+ * [0, numType) is skipped, instead of accessing memory out of bounds.
+ * */
+static
+void addPropTypes(char *buffTypes, oid* buffP, int numP, int csId, int* csTblIdxMapping, CSPropTypes* csPropTypes){
+	int i, j, type;
+	int tblId = csTblIdxMapping[csId];
+	CSPropTypes* tbl;
+
+	if (tblId == -1) return;
+
+	tbl = &csPropTypes[tblId];
+	j = 0;
+	for (i = 0; i < numP; i++){
+		while (j < tbl->numProp && tbl->lstPropTypes[j].prop != buffP[i]){
+			j++;
+		}
+		if (j >= tbl->numProp){
+			/* buffP[i] is not in this table; since j only moves forward,
+			 * no later property can be matched either */
+			break;
+		}
+		//j is position of the property buffP[i] in csPropTypes[tblId]
+		type = (int) buffTypes[i];
+		if (type < 0 || type >= tbl->lstPropTypes[j].numType){
+			continue;	/* no frequency slot for this type */
+		}
+		tbl->lstPropTypes[j].lstFreq[type]++;
+	}
+}
+
+/*
+ * Release an array of numCS CSPropTypes entries together with the
+ * per-property type and frequency lists of each entry.
+ * The memory was obtained with GDKmalloc and therefore has to be
+ * returned with GDKfree, not with the C library's free().
+ * Entries with freqCSId == -1 are taken to own no property list.
+ * A NULL csPropTypes is accepted and ignored.
+ * */
+static
+void freeCSPropTypes(CSPropTypes* csPropTypes, int numCS){
+	int i,j;
+
+	if (csPropTypes == NULL) return;
+
+	for (i = 0; i < numCS; i++){
+		if (csPropTypes[i].freqCSId != -1){
+			for (j = 0; j < csPropTypes[i].numProp; j++){
+				GDKfree(csPropTypes[i].lstPropTypes[j].lstTypes);
+				GDKfree(csPropTypes[i].lstPropTypes[j].lstFreq);
+			}
+			GDKfree(csPropTypes[i].lstPropTypes);
+		}
+	}
+	GDKfree(csPropTypes);
+}
+
static
SubCS* creatSubCS(oid subCSId, int numP, char* buff, oid subCSsign){
SubCS *subcs = (SubCS*) malloc(sizeof(SubCS));
@@ -1854,6 +1934,7 @@ void getMaximumFreqCSs(CSset *freqCSset,
int* coverage;
int* freq;
char isLabelComparable = 0;
+ char isDiffLabel = 0;
(void) labels;
(void) isLabelComparable;
@@ -1866,34 +1947,30 @@ void getMaximumFreqCSs(CSset *freqCSset,
if (strcmp(labels[i].name, "DUMMY") != 0) isLabelComparable = 1;
for (j = (i+1); j < numFreqCS; j++){
- if (freqCSset->items[j].numProp >
freqCSset->items[i].numProp){
- if (isSubset(freqCSset->items[j].lstProp,
freqCSset->items[i].lstProp,
-
freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) {
- /* CSj is a superset of CSi */
- #if USE_LABEL_FINDING_MAXCS
- if (isLabelComparable == 1 &&
strcmp(labels[i].name, labels[j].name) == 0) {
-
freqCSset->items[i].parentFreqIdx = j;
- break;
+ isDiffLabel = 0;
+ #if USE_LABEL_FINDING_MAXCS
+ if (isLabelComparable == 0 || strcmp(labels[i].name,
labels[j].name) != 0) {
+ isDiffLabel = 1;
+ }
+ #endif
+
+ if (isDiffLabel == 0){
+ if (freqCSset->items[j].numProp >
freqCSset->items[i].numProp){
+ if
(isSubset(freqCSset->items[j].lstProp, freqCSset->items[i].lstProp,
+
freqCSset->items[j].numProp,freqCSset->items[i].numProp) == 1) {
+ /* CSj is a superset of CSi */
+
freqCSset->items[i].parentFreqIdx = j;
+ break;
}
- #else
- freqCSset->items[i].parentFreqIdx = j;
- #endif
- break;
}
- }
- else if (freqCSset->items[j].numProp <
freqCSset->items[i].numProp){
- if (isSubset(freqCSset->items[i].lstProp,
freqCSset->items[j].lstProp,
-
freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) {
- /* CSj is a subset of CSi */
- #if USE_LABEL_FINDING_MAXCS
- if (isLabelComparable == 1 &&
strcmp(labels[i].name, labels[j].name) == 0) {
-
freqCSset->items[j].parentFreqIdx = i;
- }
- #else
- freqCSset->items[j].parentFreqIdx = i;
- #endif
- }
-
+ else if (freqCSset->items[j].numProp <
freqCSset->items[i].numProp){
+ if
(isSubset(freqCSset->items[i].lstProp, freqCSset->items[j].lstProp,
+
freqCSset->items[i].numProp,freqCSset->items[j].numProp) == 1) {
+ /* CSj is a subset of CSi */
+
freqCSset->items[j].parentFreqIdx = i;
+ }
+
+ }
}
//Do not need to consider the case that the numProps
are the same
@@ -2864,6 +2941,73 @@ str RDFrelationships(int *ret, BAT *sbat
return MAL_SUCCEED;
}
+
+
+/*
+ * Scan the triples (s, p, o) and, for each subject, collect its properties
+ * together with the type of their objects. When the subject changes (and
+ * once more after the last triple) the collected lists are handed to
+ * addPropTypes, which updates the <property, type> frequencies of the
+ * table that the subject's CS is mapped to.
+ *
+ * The loop relies on the triples of one subject being adjacent, and on
+ * equal properties of a subject being adjacent as well.
+ *
+ * ret:             set to 1 on completion
+ * sbat, si, pi, oi: subject BAT and the iterators over the s, p, o columns
+ * subjCSMap:       maps a subject oid to the id of its CS
+ * csTblIdxMapping: maps a CS id to its index in csPropTypes (-1 if none)
+ * csPropTypes:     statistics to update
+ * maxNumPwithDup:  maximum number of properties (counting duplicates) of
+ *                  any single subject; determines the buffer sizes
+ *
+ * NOTE(review): the results of malloc are not checked here.
+ * */
+static
+str RDFExtractCSPropTypes(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi,
+		oid *subjCSMap, int* csTblIdxMapping, CSPropTypes* csPropTypes, int maxNumPwithDup){
+
+	BUN	p, q;
+	oid	*sbt = 0, *obt, *pbt;
+	oid	curS;		/* current Subject oid */
+	int	numPwithDup;	/* Number of properties for current S */
+	char	objType;
+	char*	buffTypes;
+	oid*	buffP;
+	oid	curP;
+
+	buffTypes = (char *) malloc(sizeof(char) * (maxNumPwithDup + 1));
+	/* buffP holds oids, so it has to be sized in units of oid */
+	buffP = (oid *) malloc(sizeof(oid) * (maxNumPwithDup + 1));
+
+	numPwithDup = 0;
+	curS = 0;
+	curP = 0;
+
+	BATloop(sbat, p, q){
+		sbt = (oid *) BUNtloc(si, p);
+		if (*sbt != curS){
+			/* Flush the previous subject, if one was collected. Testing
+			 * the count rather than the position keeps this correct
+			 * when the first position of the BAT is not 0 */
+			if (numPwithDup > 0){
+				addPropTypes(buffTypes, buffP, numPwithDup, subjCSMap[curS], csTblIdxMapping, csPropTypes);
+			}
+			curS = *sbt;
+			numPwithDup = 0;
+			curP = 0;
+		}
+
+		obt = (oid *) BUNtloc(oi, p);
+		/* Check type of object: the three bits directly below the most
+		 * significant bit of a value of BUN width.
+		 * NOTE(review): assumes oid is as wide as BUN -- confirm */
+		objType = (char) (((*obt) >> (sizeof(BUN)*8 - 4)) & 7);
+
+		pbt = (oid *) BUNtloc(pi, p);
+
+		/* numPwithDup > 0: curP is only meaningful once a property has
+		 * been stored, otherwise a first property with oid 0 would be
+		 * mistaken for a repetition */
+		if (numPwithDup > 0 && curP == *pbt){
+			#if USE_MULTIPLICITY == 1
+			// Update the object type for this P as MULTIVALUES
+			buffTypes[numPwithDup-1] = MULTIVALUES;
+			#else
+			buffTypes[numPwithDup] = objType;
+			buffP[numPwithDup] = *pbt;	/* keep buffP in step with buffTypes */
+			numPwithDup++;
+			#endif
+		}
+		else{
+			buffTypes[numPwithDup] = objType;
+			buffP[numPwithDup] = *pbt;
+			numPwithDup++;
+			curP = *pbt;
+		}
+	}
+
+	/* Check for the last CS */
+	if (numPwithDup > 0){
+		addPropTypes(buffTypes, buffP, numPwithDup, subjCSMap[curS], csTblIdxMapping, csPropTypes);
+	}
+
+	free (buffTypes);
+	free (buffP);
+
+	*ret = 1;
+
+	return MAL_SUCCEED;
+}
+
static
void initCsRelBetweenMergeFreqSet(CSmergeRel *csRelBetweenMergeFreqSet, int
num){
int i;
@@ -3057,7 +3201,7 @@ int ontmetadataCount = 0;
/* Extract CS from SPO triples table */
str
-RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat
*mapbatid, int *freqThreshold, void *_freqCSset, oid **subjCSMap, oid
*maxCSoid, char **subjdefaultMap){
+RDFextractCSwithTypes(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat
*mapbatid, int *freqThreshold, void *_freqCSset, oid **subjCSMap, oid
*maxCSoid, char **subjdefaultMap,int *maxNumPwithDup){
BAT *sbat = NULL, *pbat = NULL, *obat = NULL, *mbat = NULL;
BATiter si, pi, oi; /*iterator for BAT of s,p,o columns in
spo table */
@@ -3067,7 +3211,6 @@ RDFextractCSwithTypes(int *ret, bat *sba
BUN *maxSoid;
int maxNumProp = 0;
- int maxNumPwithDup = 0;
char *csFreqMap;
CSrel *csrelSet;
CSrel *csrelToMaxFreqSet, *csrelFromMaxFreqSet;
@@ -3084,6 +3227,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
clock_t curT;
clock_t tmpLastT;
+
Labels *labels;
if ((sbat = BATdescriptor(*sbatid)) == NULL) {
@@ -3134,11 +3278,12 @@ RDFextractCSwithTypes(int *ret, bat *sba
tmpLastT = clock();
+ *maxNumPwithDup = 0;
//Phase 1: Assign an ID for each CS
#if STOREFULLCS
- RDFassignCSId(ret, sbat, si, pi, oi, freqCSset, freqThreshold, csBats,
*subjCSMap, maxCSoid, &maxNumProp, &maxNumPwithDup);
+ RDFassignCSId(ret, sbat, si, pi, oi, freqCSset, freqThreshold, csBats,
*subjCSMap, maxCSoid, &maxNumProp, maxNumPwithDup);
#else
- RDFassignCSId(ret, sbat, si, pi, freqCSset, freqThreshold, csBats,
*subjCSMap, maxCSoid, &maxNumProp, &maxNumPwithDup);
+ RDFassignCSId(ret, sbat, si, pi, freqCSset, freqThreshold, csBats,
*subjCSMap, maxCSoid, &maxNumProp, maxNumPwithDup);
#endif
curT = clock();
@@ -3149,7 +3294,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
printf("Max CS oid: " BUNFMT "\n", *maxCSoid);
- printf("Max Number of P (considering duplicated P): %d \n",
maxNumPwithDup);
+ printf("Max Number of P (considering duplicated P): %d \n",
*maxNumPwithDup);
csFreqMap = (char*) malloc(sizeof(char) * (*maxCSoid +1));
initCharArray(csFreqMap, *maxCSoid +1, 0);
@@ -3163,7 +3308,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
csSubCSSet = initCS_SubCSSets(*maxCSoid +1);
- RDFrelationships(ret, sbat, si, pi, oi, *subjCSMap, subjSubCSMap,
csSubCSSet, csrelSet, *maxSoid, maxNumPwithDup);
+ RDFrelationships(ret, sbat, si, pi, oi, *subjCSMap, subjSubCSMap,
csSubCSSet, csrelSet, *maxSoid, *maxNumPwithDup);
curT = clock();
printf (" ----- Exploring subCSs and FKs took %f seconds.\n",
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
@@ -3229,6 +3374,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
curT = clock();
printf (" ----- Merging Frequent CSs took %f seconds.\n",
((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
tmpLastT = curT;
+
csRelBetweenMergeFreqSet = (CSmergeRel *) malloc (sizeof(CSmergeRel) *
freqCSset->numCSadded);
initCsRelBetweenMergeFreqSet(csRelBetweenMergeFreqSet,
freqCSset->numCSadded);
@@ -3438,23 +3584,15 @@ str triplesubsort(BAT **sbat, BAT **pbat
}
static
-void initCStablesAndIdxMapping(CStableStat* cstablestat, CSset* freqCSset,
int* csTblIdxMapping, int* mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping){
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list