Changeset: 4d57d94c1068 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=4d57d94c1068
Modified Files:
monetdb5/extras/rdf/rdf_shredder.c
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Analyze and filter multi-valued columns.
diffs (241 lines):
diff --git a/monetdb5/extras/rdf/rdf_shredder.c
b/monetdb5/extras/rdf/rdf_shredder.c
--- a/monetdb5/extras/rdf/rdf_shredder.c
+++ b/monetdb5/extras/rdf/rdf_shredder.c
@@ -243,20 +243,20 @@ getObjectType(unsigned char* objStr, BUN
*realNumValue = BUN_NONE;
if (strlen((str)objStr) > 20){
- endpart = objStr + (strlen((str)objStr) - 19);
//XMLSchema#dateTime>
- //printf("Original: %s --> substring: %s \n", (str)objStr,
(str)endpart);
+ endpart = objStr + (strlen((str)objStr) - 19); /*
XMLSchema#dateTime> */
+ /* printf("Original: %s --> substring: %s \n", (str)objStr,
(str)endpart); */
if ( (pos = strstr((str)endpart , "XMLSchema#date>")) != NULL
|| (pos = strstr((str)endpart, "XMLSchema#dateTime>")) != NULL ){
obType = DATETIME;
- //printf("%s: DateTime \n", objStr);
+ /* printf("%s: DateTime \n", objStr); */
}
else if ((pos = strstr((str) endpart, "XMLSchema#int>")) !=
NULL || (pos = strstr((str)endpart, "XMLSchema#integer>")) != NULL){
obType = INTEGER;
valuepart = substring((char*)objStr, 2 , (int) (pos -
(str)objStr - 28));
- //printf("%s: Integer \n. Length of value %d ==> value
%s \n", objStr, (int) (pos - (str)objStr - 28), valuepart);
+ /* printf("%s: Integer \n. Length of value %d ==> value
%s \n", objStr, (int) (pos - (str)objStr - 28), valuepart); */
if (isInt(valuepart) == 1){ /* Check whether the
real value is an integer */
*realNumValue = (BUN) atoi(valuepart);
- //printf("Real value is: " BUNFMT " \n",
*realNumValue);
+ /* printf("Real value is: " BUNFMT " \n",
*realNumValue); */
}
else
obType = STRING;
@@ -268,11 +268,11 @@ getObjectType(unsigned char* objStr, BUN
|| (pos = strstr((str) endpart,
"XMLSchema#double>")) != NULL
|| (pos = strstr((str) endpart,
"XMLSchema#decimal>")) != NULL){
obType = FLOAT;
- //printf("%s: Float \n", objStr);
+ /* printf("%s: Float \n", objStr); */
}
else {
obType = STRING;
- //printf("%s: String \n", objStr);
+ /* printf("%s: String \n", objStr); */
}
}
else
@@ -758,6 +758,7 @@ RDFParser (BAT **graph, str *location, s
dp = opendir (pdata->location);
if (dp != NULL){
while ((ep = readdir (dp)) != NULL){
+ printf("Checking file %s
\n",ep->d_name);
if (strstr (ep->d_name,".nt")!=
NULL || strstr (ep->d_name,".ttl")!= NULL ){
sprintf(tmpfilename,"%s%s",pdata->location,ep->d_name);
printf("Loading file %s
..",tmpfilename);
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -714,6 +714,11 @@ CSPropTypes* initCSPropTypes(CSset* freq
csPropTypes[id].lstPropTypes = (PropTypes*)
GDKmalloc(sizeof(PropTypes) * csPropTypes[id].numProp);
for (j = 0; j < csPropTypes[id].numProp; j++){
csPropTypes[id].lstPropTypes[j].prop =
freqCSset->items[i].lstProp[j];
+ #if STAT_ANALYZE
+ csPropTypes[id].lstPropTypes[j].numNull = 0;
+ csPropTypes[id].lstPropTypes[j].numMVType = 0;
+ csPropTypes[id].lstPropTypes[j].numSingleType =
0;
+ #endif
csPropTypes[id].lstPropTypes[j].propFreq = 0;
csPropTypes[id].lstPropTypes[j].propCover = 0;
csPropTypes[id].lstPropTypes[j].numType =
MULTIVALUES + 1;
@@ -783,22 +788,85 @@ void genCSPropTypesColIdx(CSPropTypes* c
}
+
+}
+
+static
+void printCSPropTypes(CSPropTypes* csPropTypes, int numMergedCS, CSset*
freqCSset, int freqThreshold){
+ char filename[100];
+ char tmpStr[50];
+ FILE *fout;
+ int i, j, k;
+ int numMVCS = 0;
+ int numMVCSFilter = 0;
+ int numMVCols = 0;
+ int numMVColsFilter = 0;
+ int numNonMVCS = 0;
+ char tmpIsMVCS = 0;
+ char tmpIsMVCSFilter = 0;
+ double threshold = 1.1;
+ double tmpRatio;
+
+ strcpy(filename, "csPropTypes");
+ sprintf(tmpStr, "%d", freqThreshold);
+ strcat(filename, tmpStr);
+ strcat(filename, ".txt");
+
+ fout = fopen(filename,"wt");
+
/* Print cspropTypes */
for (i = 0; i < numMergedCS; i++){
- printf("MergedCS %d (Freq: %d): \n", i,
freqCSset->items[csPropTypes[i].freqCSId].support);
+ fprintf(fout, "MergedCS %d (Freq: %d): \n", i,
freqCSset->items[csPropTypes[i].freqCSId].support);
+ tmpIsMVCS = 0;
+ tmpIsMVCSFilter = 0;
for(j = 0; j < csPropTypes[i].numProp; j++){
- printf(" P " BUNFMT "(%d | cov:%d):",
csPropTypes[i].lstPropTypes[j].prop,
csPropTypes[i].lstPropTypes[j].defaultType,csPropTypes[i].lstPropTypes[j].propCover);
+ if (csPropTypes[i].lstPropTypes[j].numMVType > 0){
+ tmpIsMVCS = 1;
+ numMVCols++;
+ }
+ tmpRatio = (double)
(csPropTypes[i].lstPropTypes[j].propCover /
(csPropTypes[i].lstPropTypes[j].numSingleType +
csPropTypes[i].lstPropTypes[j].numMVType));
+
+ if ((csPropTypes[i].lstPropTypes[j].numMVType > 0) &&
(tmpRatio > threshold)){
+ tmpIsMVCSFilter = 1;
+ numMVColsFilter++;
+ }
+
+ fprintf(fout, " P " BUNFMT "(%d | cov:%d | Null: %d |
Single: %d | Multi: %d) \n",
+ csPropTypes[i].lstPropTypes[j].prop,
csPropTypes[i].lstPropTypes[j].defaultType,csPropTypes[i].lstPropTypes[j].propCover,
+ csPropTypes[i].lstPropTypes[j].numNull,
csPropTypes[i].lstPropTypes[j].numSingleType,
csPropTypes[i].lstPropTypes[j].numMVType);
+ fprintf(fout, " ");
for (k = 0; k < csPropTypes[i].lstPropTypes[j].numType;
k++){
- printf(" Type %d (%d) | ", k,
csPropTypes[i].lstPropTypes[j].lstFreq[k]);
+ fprintf(fout, " Type %d (%d) | ", k,
csPropTypes[i].lstPropTypes[j].lstFreq[k]);
}
- printf("\n");
- printf(" ");
+ fprintf(fout, "\n");
+ fprintf(fout, " ");
for (k = 0; k < csPropTypes[i].lstPropTypes[j].numType;
k++){
- printf(" Tbl %d (cl%d) | ",
csPropTypes[i].lstPropTypes[j].TableTypes[k],
csPropTypes[i].lstPropTypes[j].colIdxes[k]);
+ fprintf(fout, " Tbl %d (cl%d) | ",
csPropTypes[i].lstPropTypes[j].TableTypes[k],
csPropTypes[i].lstPropTypes[j].colIdxes[k]);
}
- printf("\n");
+ fprintf(fout, "\n");
+ }
+
+ if (tmpIsMVCS == 1){
+ numMVCS++;
+ }
+
+ if (tmpIsMVCSFilter == 1){
+ numMVCSFilter++;
}
}
+ numNonMVCS = numMergedCS - numMVCS;
+ fprintf(fout, "Number of tables with MV col: %d \n", numMVCS);
+ fprintf(fout, "Number of tables with NO MV col: %d \n", numNonMVCS);
+ fprintf(fout, "Number of MV cols: %d \n", numMVCols);
+
+ fprintf(fout, "==== With filtering ==== \n");
+ fprintf(fout, "Number of tables with MV col: %d \n", numMVCSFilter);
+ fprintf(fout, "Number of tables with NO MV col: %d \n", (numMergedCS -
numMVCSFilter));
+ fprintf(fout, "Number of MV cols: %d \n", numMVColsFilter);
+
+
+ fclose(fout);
+
}
/*
* Add types of properties
@@ -817,6 +885,9 @@ void addPropTypes(char *buffTypes, oid*
for (i = 0; i < numP; i++){
//printf(" P: " BUNFMT " Type: %d ", buffP[i],
buffTypes[i]);
while (csPropTypes[tblId].lstPropTypes[j].prop !=
buffP[i]){
+ #if STAT_ANALYZE
+ csPropTypes[tblId].lstPropTypes[j].numNull++;
+ #endif
j++;
}
//j is position of the property buffP[i] in
csPropTypes[tblId]
@@ -824,8 +895,24 @@ void addPropTypes(char *buffTypes, oid*
csPropTypes[tblId].lstPropTypes[j].propFreq++;
csPropTypes[tblId].lstPropTypes[j].propCover +=
buffCover[i];
csPropTypes[tblId].lstPropTypes[j].lstFreq[(int)buffTypes[i]]++;
+ #if STAT_ANALYZE
+ if (buffTypes[i] == MULTIVALUES){
+ csPropTypes[tblId].lstPropTypes[j].numMVType++;
+ }
+ else{
+
csPropTypes[tblId].lstPropTypes[j].numSingleType++;
+ }
+ #endif
+
+ j++;
}
+ #if STAT_ANALYZE
+ while (j < csPropTypes[tblId].numProp){
+ csPropTypes[tblId].lstPropTypes[j].numNull++;
+ j++;
+ }
+ #endif
}
//printf("\n");
}
@@ -3450,10 +3537,12 @@ RDFextractCSwithTypes(int *ret, bat *sba
csIdFreqIdxMap = (int *) malloc (sizeof(int) * (*maxCSoid + 1));
initcsIdFreqIdxMap(csIdFreqIdxMap, *maxCSoid + 1, -1, freqCSset);
printf("Using ontologies with %d ontattributesCount and %d
ontmetadataCount \n",ontattributesCount,ontmetadataCount);
-
+
labels = createLabels(freqCSset, csrelSet, *maxCSoid + 1, sbat, si, pi,
oi, *subjCSMap, mbat, csIdFreqIdxMap, *freqThreshold, ontattributes,
ontattributesCount, ontmetadata, ontmetadataCount);
-
+ curT = clock();
+ printf("Done labeling!!! Took %f seconds.\n", ((float)(curT -
tmpLastT))/CLOCKS_PER_SEC);
+ tmpLastT = curT;
getMaximumFreqCSs(freqCSset, labels, csBats->coverageBat,
csBats->freqBat, *maxCSoid + 1, &numMaxCSs);
@@ -4244,6 +4333,7 @@ RDFreorganize(int *ret, CStableStat *cst
csPropTypes = initCSPropTypes(freqCSset, numTables);
RDFExtractCSPropTypes(ret, sbat, si, pi, oi, subjCSMap,
csTblIdxMapping, csPropTypes, maxNumPwithDup);
genCSPropTypesColIdx(csPropTypes, numTables, freqCSset);
+ printCSPropTypes(csPropTypes, numTables, freqCSset, *freqThreshold);
// Init CStableStat
initCStables(cstablestat, freqCSset, csPropTypes, numTables);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -93,6 +93,8 @@ typedef struct PropStat {
#define FULL_PROP_STAT 1 // Only use for showing the statistic on all
properties / all CSs. (Default should be 0)
+#define STAT_ANALYZE 1 // Only used for collecting statistics on the number
of multi-valued/null/single-valued props
+
#define USE_LABEL_FINDING_MAXCS 0 // Use the labels received from
labeling process for finding maxCS
#define USE_LABEL_FOR_MERGING 0 // Use the labels received from
labeling process for finding mergeCS
@@ -229,6 +231,11 @@ typedef struct CStableStat {
typedef struct PropTypes{
oid prop;
int numType;
+#if STAT_ANALYZE
+ int numMVType; /* Number of subjects having this property as a
multi-valued prop. */
+ int numNull; /* Number of subjects that don't have obj value
for this prop */
+ int numSingleType; /* Number of subjects having this property as a single-valued (non-multi-valued) prop. */
+#endif
int propFreq; /* without considering type = Table frequency*/
int propCover; /* = coverage of that property */
char* lstTypes;
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list