Changeset: 59246b0623d0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=59246b0623d0
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Use a percentage of the total frequency of all CSs as the threshold for
detecting dimension tables
diffs (60 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -432,10 +432,17 @@ void updateFreqCStype(CSset *freqCSset,
int i;
int numDimensionCS = 0;
-
+ int totalSupport = 0; /* Total CS frequency */
+ float threshold = 0.0;
+
+ for (i = 0; i < num; i++){
+ totalSupport += freqCSset->items[i].support;
+ }
+ threshold = (float)totalSupport * IR_DIMENSION_THRESHOLD_PERCENTAGE;
+ printf("Total support %d --> Threshold for dimension table is: %f \n",
totalSupport, threshold);
for (i = 0; i < num; i++){
if (refCount[i] < freqCSset->items[i].support) continue;
- if (curIRScores[i] < IR_DIMENSION_THRESHOLD) continue;
+ if (curIRScores[i] < threshold) continue;
freqCSset->items[i].type = DIMENSIONCS;
//printf("A dimension CS with IR score = %f \n",
curIRScores[i]);
@@ -6571,7 +6578,10 @@ RDFreorganize(int *ret, CStableStat *cst
// print labels
printf("Start exporting labels \n");
+
+ #if EXPORT_LABEL
exportLabels(labels, freqCSset, csRelMergeFreqSet, *freqThreshold);
+ #endif
curT = clock();
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -125,9 +125,10 @@ typedef struct PropStat {
/* ---- For detecting dimension table */
#define NUM_ITERATION_FOR_IR 3 /* Number of iteration for
indirect referrences to a CS (table) */
-#define IR_DIMENSION_THRESHOLD 100000 /* Score of indirect references that
the CS can be considered as a dimension CS
- Number of IR references should be
several times larger than the CS frequency
- */
+#define IR_DIMENSION_THRESHOLD_PERCENTAGE 0.02 /* Fraction of the total CS frequency that the indirect-reference (IR) score of a CS must reach for it to be considered a dimension CS:
+     threshold = IR_DIMENSION_THRESHOLD_PERCENTAGE * totalFrequency
+     The number of IR references should be several times larger than the CS frequency
+     */
#define NOT_MERGE_DIMENSIONCS 1
#define FILTER_INFREQ_FK_FOR_IR 1 /* We filter out all
the dirty references from a CS */
@@ -211,6 +212,8 @@ typedef struct SubCSSet{
#define MIN_FK_FREQUENCY 0.1 // The frequency of a FK should
be > MIN_FK_FREQUENCY * The frequency of a mergedCS (or the number of tuples in
one table)
#define MIN_FK_PROPCOVERAGE 0.9 // The FK needs to happen in
MIN_FK_PROPCOVERAGE of all instances of the particular property
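+#define EXPORT_LABEL 0 /* Non-zero enables the exportLabels() call in rdfschema.c; 0 disables label export. TODO (note left unfinished in the original commit): only disable the ... */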
+
typedef struct CSset{
CS* items;
int numOrigFreqCS;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list