Changeset: 7b52ce274b79 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=7b52ce274b79
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdfparams.c
monetdb5/extras/rdf/rdfparams.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
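More params in params.ini: minTableSize, infreqTypeThreshold and infreqPropThreshold are now read at run time, replacing the compile-time MINIMUM_TABLE_SIZE, INFREQ_TYPE_THRESHOLD and INFREQ_PROP_THRESHOLD constants; several progress printf calls are commented out.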
diffs (truncated from 356 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2097,7 +2097,7 @@ CSlabel* createLabels(CSset* freqCSset,
#endif
curT = clock();
- printf (" Labeling: Collecting type attributes histogram took %f
seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
+ //printf (" Labeling: Collecting type attributes histogram took %f
seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
tmpLastT = curT;
// Relation (FK)
@@ -2109,7 +2109,7 @@ CSlabel* createLabels(CSset* freqCSset,
#endif
curT = clock();
- printf (" Labeling: Collecting relationship metatdata count took %f
seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
+ //printf (" Labeling: Collecting relationship metatdata count took %f
seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
tmpLastT = curT;
// Ontologies
@@ -2127,7 +2127,7 @@ CSlabel* createLabels(CSset* freqCSset,
#endif
curT = clock();
- printf (" Labeling: Collecting ontology lookup results took %f
seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
+ //printf (" Labeling: Collecting ontology lookup results took %f
seconds.\n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC);
tmpLastT = curT;
// Assigning Names
diff --git a/monetdb5/extras/rdf/rdfparams.c b/monetdb5/extras/rdf/rdfparams.c
--- a/monetdb5/extras/rdf/rdfparams.c
+++ b/monetdb5/extras/rdf/rdfparams.c
@@ -31,6 +31,10 @@ int dimensionFactor;
int upperboundNumTables;
float generalityThreshold;
float simTfidfThreshold;
+int minTableSize;
+float infreqTypeThreshold;
+float infreqPropThreshold;
+
void createDefaultParamsFile(void){
@@ -40,8 +44,10 @@ void createDefaultParamsFile(void){
fprintf(paramFile, "dimensionFactor 1000\n");
fprintf(paramFile, "upperboundNumTables 1000\n");
- fprintf(paramFile, "simTfidfThreshold 0.75");
-
+ //fprintf(paramFile, "simTfidfThreshold 0.75");
+ fprintf(paramFile, "minTableSize 1000\n");
+ fprintf(paramFile, "infreqTypeThreshold 0.1\n");
+ fprintf(paramFile, "infreqPropThreshold 0.05\n");
fclose(paramFile);
}
@@ -71,6 +77,18 @@ void readParamsInput(void){
simTfidfThreshold = atof(value);
printf("simTfidfThreshold = %f\n",
simTfidfThreshold);
}
+ else if (strcmp(variable, "minTableSize") == 0){
+ minTableSize = atoi(value);
+ printf("minTableSize = %d\n", minTableSize);
+ }
+ else if (strcmp(variable, "infreqTypeThreshold") == 0){
+ infreqTypeThreshold= atof(value);
+ printf("infreqTypeThreshold= %f\n",
infreqTypeThreshold);
+ }
+ else if (strcmp(variable, "infreqPropThreshold") == 0){
+ infreqPropThreshold= atof(value);
+ printf("infreqPropThreshold= %f\n",
infreqPropThreshold);
+ }
}
}
diff --git a/monetdb5/extras/rdf/rdfparams.h b/monetdb5/extras/rdf/rdfparams.h
--- a/monetdb5/extras/rdf/rdfparams.h
+++ b/monetdb5/extras/rdf/rdfparams.h
@@ -35,6 +35,9 @@ extern int dimensionFactor;
extern int upperboundNumTables;
extern float generalityThreshold;
extern float simTfidfThreshold;
+extern int minTableSize;
+extern float infreqTypeThreshold;
+extern float infreqPropThreshold;
rdf_export void
createDefaultParamsFile(void);
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -252,10 +252,10 @@ char isCSTable(CS item, oid name){
#if REMOVE_SMALL_TABLE
if (item.support > acceptableTableSize) return 1;
- //if (item.coverage < MINIMUM_TABLE_SIZE) return 0;
+ //if (item.coverage < minTableSize) return 0;
//More strict with table which does not have name
- //if ((name == BUN_NONE) && item.support < MINIMUM_TABLE_SIZE) return
0;
- if (item.support < MINIMUM_TABLE_SIZE) return 0;
+ //if ((name == BUN_NONE) && item.support < minTableSize) return 0;
+ if (item.support < minTableSize) return 0;
#endif
return 1;
@@ -506,7 +506,7 @@ void getOrigRefCount(CSrel *csrelSet, CS
for (j = 0; j < csrelSet[i].numRef; j++){
freqId = csrelSet[i].lstRefFreqIdx[j];
#if FILTER_INFREQ_FK_FOR_IR
- if (csrelSet[i].lstCnt[j] <
INFREQ_TYPE_THRESHOLD * freqCSset->items[freqId].support) continue;
+ if (csrelSet[i].lstCnt[j] < infreqTypeThreshold
* freqCSset->items[freqId].support) continue;
#endif
//Do not count the self-reference
if (freqId != i) refCount[freqId] +=
csrelSet[i].lstCnt[j];
@@ -539,7 +539,7 @@ void getIRNums(CSrel *csrelSet, CSset *f
for (j = 0; j < csrelSet[i].numRef; j++){
freqId = csrelSet[i].lstRefFreqIdx[j];
#if FILTER_INFREQ_FK_FOR_IR
- if (csrelSet[i].lstCnt[j] <
INFREQ_TYPE_THRESHOLD * freqCSset->items[freqId].support) continue;
+ if (csrelSet[i].lstCnt[j] <
infreqTypeThreshold * freqCSset->items[freqId].support) continue;
#endif
if (freqId != i){ //Do not count
the self-reference
//curIRScores[freqId] +=
(lastIRScores[i] * (float)csrelSet[i].lstCnt[j]/(float)refCount[freqId]) +
csrelSet[i].lstCnt[j];
@@ -579,7 +579,7 @@ void updateFreqCStype(CSset *freqCSset,
printf("List of dimension tables: \n");
for (i = 0; i < num; i++){
#if ONLY_SMALLTBL_DIMENSIONTBL
- if (freqCSset->items[i].support > MINIMUM_TABLE_SIZE) continue;
+ if (freqCSset->items[i].support > minTableSize) continue;
#endif
if (refCount[i] < freqCSset->items[i].support) continue;
threshold = freqCSset->items[i].support * ratio;
@@ -870,7 +870,7 @@ char isMultiValueCol(PropTypes pt){
tmpRatio = ((double)pt.propCover / (pt.numSingleType + pt.numMVType));
//printf("NumMVType = %d | Ratio %f \n", pt.numMVType, tmpRatio);
- if ((pt.numMVType > 0) && (tmpRatio > (1 + INFREQ_TYPE_THRESHOLD))){
+ if ((pt.numMVType > 0) && (tmpRatio > (1 + infreqTypeThreshold))){
return 1;
}
else return 0;
@@ -878,7 +878,7 @@ char isMultiValueCol(PropTypes pt){
static
char isInfrequentProp(PropTypes pt, CS cs){
- if (pt.propFreq < cs.support * INFREQ_PROP_THRESHOLD) return 1;
+ if (pt.propFreq < cs.support * infreqPropThreshold) return 1;
else return 0;
}
@@ -978,7 +978,7 @@ void genCSPropTypesColIdx(CSPropTypes* c
defaultIdx = k;
}
//TODO: Check the case of single value
col has a property with multi-valued objects
- if
(csPropTypes[i].lstPropTypes[j].lstFreq[k] <
csPropTypes[i].lstPropTypes[j].propFreq * INFREQ_TYPE_THRESHOLD){
+ if
(csPropTypes[i].lstPropTypes[j].lstFreq[k] <
csPropTypes[i].lstPropTypes[j].propFreq * infreqTypeThreshold){
//non-frequent type goes to PSO
csPropTypes[i].lstPropTypes[j].TableTypes[k] = PSOTBL;
}
@@ -1403,7 +1403,7 @@ int countNumberMergeCS(CSset *csSet){
}
}
- printf("Max number of prop among %d merged CS is: %d \n", num,
maxNumProp);
+ //printf("Max number of prop among %d merged CS is: %d \n", num,
maxNumProp);
return num;
@@ -3646,7 +3646,7 @@ void generatecsRelSum(CSrel csRel, int f
freq = freqCSset->items[csRel.origFreqIdx].support;
referredFreqId = csRel.lstRefFreqIdx[i];
freqOfReferredCS = freqCSset->items[referredFreqId].support;
- if (freq > MIN_FROMTABLE_SIZE_S5 && (((float)freq *
INFREQ_TYPE_THRESHOLD) < csRel.lstCnt[i])
+ if (freq > MIN_FROMTABLE_SIZE_S5 && (((float)freq *
infreqTypeThreshold) < csRel.lstCnt[i])
&& freqOfReferredCS < csRel.lstCnt[i] *
MIN_TO_PERCETAGE_S5){
p = csRel.lstPropId[i];
@@ -3787,8 +3787,8 @@ void buildLabelStatForTable(LabelStat *l
numDummy++;
}
- printf("Collect label stat for final table: Total number of distinct
labels %d \n", labelStat->numLabeladded);
- printf("Number of DUMMY freqCS: %d \n",numDummy);
+ //printf("Collect label stat for final table: Total number of distinct
labels %d \n", labelStat->numLabeladded);
+ //printf("Number of DUMMY freqCS: %d \n",numDummy);
//Build list of freqId corresponding to each label
labelStat->freqIdList = (int**) malloc(sizeof(int*) *
labelStat->numLabeladded);
@@ -3864,8 +3864,8 @@ void buildLabelStatForFinalMergeCS(Label
numDummy++;
}
- printf("Collect label stat for final mergeCS: Total number of distinct
labels %d \n", labelStat->numLabeladded);
- printf("Number of DUMMY freqCS: %d \n",numDummy);
+ //printf("Collect label stat for final mergeCS: Total number of
distinct labels %d \n", labelStat->numLabeladded);
+ //printf("Number of DUMMY freqCS: %d \n",numDummy);
//Build list of freqId corresponding to each label
labelStat->freqIdList = (int**) malloc(sizeof(int*) *
labelStat->numLabeladded);
@@ -3951,8 +3951,8 @@ void buildLabelStat(LabelStat *labelStat
numDummy++;
}
- printf("Total number of distinct labels in Top%d is %d \n", k,
labelStat->numLabeladded);
- printf("Number of DUMMY freqCS: %d \n",numDummy);
+ //printf("Total number of distinct labels in Top%d is %d \n", k,
labelStat->numLabeladded);
+ //printf("Number of DUMMY freqCS: %d \n",numDummy);
//Build list of FreqCS
labelStat->freqIdList = (int**) malloc(sizeof(int*) *
labelStat->numLabeladded);
for (i =0; i < labelStat->numLabeladded; i++){
@@ -4292,7 +4292,7 @@ void mergeFreqCSByS5(CSrel *csrelMergeFr
propStat = initPropStat();
getPropStatisticsFromMergeCSs(propStat, curNumMergeCS,
mergeCSFreqCSMap, freqCSset);
- printf("Start merging CS by using S5[From FK] \n");
+ //printf("Start merging CS by using S5[From FK] \n");
#if NO_OUTPUTFILE == 0
strcpy(filename, "csRelSum.txt");
@@ -4308,8 +4308,8 @@ void mergeFreqCSByS5(CSrel *csrelMergeFr
if (freqCSset->items[freqId].numProp > maxNumPropInMergeCS)
maxNumPropInMergeCS = freqCSset->items[freqId].numProp;
}
- printf("maxNumRefPerCS = %d \n", maxNumRefPerCS);
- printf("max number of prop in mergeCS: %d \n", maxNumPropInMergeCS);
+ //printf("maxNumRefPerCS = %d \n", maxNumRefPerCS);
+ //printf("max number of prop in mergeCS: %d \n", maxNumPropInMergeCS);
csRelSum = initCSrelSum(maxNumPropInMergeCS,maxNumRefPerCS);
@@ -4618,6 +4618,7 @@ void mergeCSByS4(CSset *freqCSset, CSlab
if (simscore > simTfidfThreshold &&
(existDiscriminatingProp || isSameLabel)){
#else
if (simscore > simTfidfThreshold &&
existDiscriminatingProp){
+ //if (simscore > simTfidfThreshold){
#endif
#else
if (simscore > SIM_THRESHOLD) {
@@ -4886,7 +4887,7 @@ static void getStatisticFinalCSs(CSset *
}
for (k = 1; k < 10; k++) {
- if
((csPropTypes[i].lstPropTypes[j].propFreq * k) <
freqCSset->items[freqId].support * INFREQ_PROP_THRESHOLD){
+ if
((csPropTypes[i].lstPropTypes[j].propFreq * k) <
freqCSset->items[freqId].support * infreqPropThreshold){
totalCoverage10[k] =
totalCoverage10[k] - csPropTypes[i].lstPropTypes[j].propCover;
tmpNumProp10[k]--;
};
@@ -8369,7 +8370,7 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
// add relation to new data structure
//Compare with prop coverage from csproptype
- if (rel.lstCnt[j] < freqCSset->items[toFreqId].support
* INFREQ_TYPE_THRESHOLD) continue;
+ if (rel.lstCnt[j] < freqCSset->items[toFreqId].support
* infreqTypeThreshold) continue;
to = mfreqIdxTblIdxMapping[toFreqId];
assert(to != -1);
@@ -8386,8 +8387,8 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
//Filtering: For big size table, if large number of
prop's instances need to refer to a certain table
// else, all instances of that prop must refer to the
certain table
- if (freqCSset->items[i].coverage > MINIMUM_TABLE_SIZE){
- if
(csPropTypes[from].lstPropTypes[propIdx].propCover * (1 -
INFREQ_TYPE_THRESHOLD) > rel.lstCnt[j]) continue;
+ if (freqCSset->items[i].coverage > minTableSize){
+ if
(csPropTypes[from].lstPropTypes[propIdx].propCover * (1 - infreqTypeThreshold)
> rel.lstCnt[j]) continue;
else if
(csPropTypes[from].lstPropTypes[propIdx].propCover == rel.lstCnt[j])
csPropTypes[from].lstPropTypes[propIdx].isDirtyFKProp = 0;
else
@@ -8955,8 +8956,8 @@ Pscore computeMetricsQ(CSset *freqCSset)
}
}
printf("Performance metric Q = (weighting %f)/(totalCov %d * numTbl %d)
\n", Q,totalCov, curNumMergeCS);
- printf("Average precision = %f\n",(float)totalPrecision/curNumMergeCS);
- printf("Overall precision = %f (overfill %lld / overalMaxFill %lld)\n",
(float) overalFill/overalMaxFill, overalFill, overalMaxFill);
+ //printf("Average precision =
%f\n",(float)totalPrecision/curNumMergeCS);
+ //printf("Overall precision = %f (overfill %lld / overalMaxFill
%lld)\n", (float) overalFill/overalMaxFill, overalFill, overalMaxFill);
//printf("Average precision = %f\n",(float)totalPrecision/totalCov);
Q = Q/((float)totalCov * curNumMergeCS);
@@ -9221,7 +9222,7 @@ void RDFmergingTrial(CSset *freqCSset, C
tmpLastT = clock();
curNumMergeCS = countNumberMergeCS(freqCSset);
- printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS);
+ //printf("Before using rules: Number of freqCS is: %d
\n",curNumMergeCS);
/* ---------- S1 ------- */
mergecsId = maxCSoid + 1;
@@ -9229,7 +9230,7 @@ void RDFmergingTrial(CSset *freqCSset, C
mergeFreqCSByS1(freqCSset, labels, &mergecsId, ontmetadata,
ontmetadataCount, mapbatid); /*S1: Merge all freqCS's sharing top-3 candidates
*/
curNumMergeCS = countNumberMergeCS(freqCSset);
- printf("S1: Number of mergeCS: %d \n", curNumMergeCS);
+ //printf("S1: Number of mergeCS: %d \n", curNumMergeCS);
#if STORE_PERFORMANCE_METRIC_INFO
//computeMetricsQ(freqCSset);
@@ -9248,7 +9249,7 @@ void RDFmergingTrial(CSset *freqCSset, C
freeCSrelSet(tmpCSrelToMergeCS,tmpNumRel);
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list