Changeset: 76e64ee7921f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=76e64ee7921f
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Check the number of subjects having no discriminating prop in the final table.
diffs (140 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -3237,12 +3237,9 @@ PropStat* getPropStatisticsByTable(int n
propStat->maxNumPPerCS = cs.numProp;
}
- /* Do not calculate the TFIDF score. May need in the future
- *
for (i = 0; i < propStat->numAdded; i++){
- propStat->tfidfs[i] = tfidfComp(propStat->freqs[i],numMaxCSs);
- }
- */
+ propStat->tfidfs[i] = tfidfComp(propStat->freqs[i],numTables);
+ }
*numdistinctMCS = k;
@@ -5261,7 +5258,6 @@ str RDFassignCSId(int *ret, BAT *sbat, B
return MAL_SUCCEED;
}
-
static
str RDFgetRefCounts(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi,
oid *subjCSMap, int maxNumProp, BUN maxSoid, int *refCount){
@@ -7904,6 +7900,18 @@ void computeMetricsQForRefinedTable(CSse
int tmpNumFreqProps;
int *numRefinedFills = NULL;
int *numRefinedSupport = NULL;
+ #if NO_OUTPUTFILE == 0
+ PropStat *propStat = NULL;
+ int numdistinctMCS = 0;
+ int numSubjWithoutDiscProp = 0;
+ int numTriplesWihtoutDiscProp = 0;
+ char isContainedDiscProp = 0;
+ oid p;
+ oid *pbt;
+ BUN bun;
+ FILE *fout;
+ char filename[100];
+ #endif
fillRatio = (float*)malloc(sizeof(float) * numTables);
refRatio = (float*)malloc(sizeof(float) * numTables);
@@ -7917,6 +7925,25 @@ void computeMetricsQForRefinedTable(CSse
numRefinedSupport[i] =
freqCSset->items[mTblIdxFreqIdxMapping[i]].support;
}
+
+ #if NO_OUTPUTFILE == 0
+
+ propStat = getPropStatisticsByTable(numTables, mTblIdxFreqIdxMapping,
freqCSset, &numdistinctMCS);
+ //Print the TF-IDF score of each prop in each table
+
+ strcpy(filename,"propStatWithFinalSchema.txt");
+ fout = fopen(filename,"wt");
+ fprintf(fout, "PropertyOid #ofCSs tfidfscore");
+ for (i = 0; i < propStat->numAdded; i++){
+ pbt = (oid *) Tloc(propStat->pBat, i);
+ fprintf(fout, BUNFMT " %d %f \n", *pbt,
propStat->plCSidx[i].numAdded,propStat->tfidfs[i]);
+ }
+ fclose(fout);
+ #endif
+
+
+
+
//Removing LOTSOFNULL_SUBJECT_THRESHOLD
//Check which freqCS having small number of prop
//--> they will be removed from the final table.
@@ -7961,7 +7988,38 @@ void computeMetricsQForRefinedTable(CSse
}
- }
+
+ #if NO_OUTPUTFILE == 0
+ //Get the number of subject having no discriminating props in
final Table
+ cs = freqCSset->items[i];
+
+ isContainedDiscProp = 0;
+ for (j = 0; j < cs.numProp; j++){
+ p = cs.lstProp[j];
+ bun = BUNfnd(BATmirror(propStat->pBat),(ptr) &p);
+ if (bun == BUN_NONE) {
+ printf("FreqCS: %d, prop "BUNFMT" --> This prop
must be in propStat!!!!\n",i,p);
+ }
+ else{
+ if (propStat->tfidfs[bun] >
MIN_TFIDF_PROP_FINALTABLE) {
+ isContainedDiscProp = 1;
+ break;
+ }
+ }
+ }
+ if (isContainedDiscProp == 0){ //There is no discriminating
prop in this CS
+ numSubjWithoutDiscProp += cs.support;
+ numTriplesWihtoutDiscProp += cs.coverage;
+ }
+
+
+ #endif
+ }
+
+ #if NO_OUTPUTFILE == 0
+ printf("Number of Subject having no discriminating props is:
%d\n",numSubjWithoutDiscProp);
+ printf(" ==> Removing these subject will remove %d triples
\n",numTriplesWihtoutDiscProp);
+ #endif
for (i = 0; i < numTables; i++){
tmpFinalFreqIdx = mTblIdxFreqIdxMapping[i];
@@ -7995,6 +8053,9 @@ void computeMetricsQForRefinedTable(CSse
free(weight);
free(numRefinedFills);
free(numRefinedSupport);
+ #if NO_OUTPUTFILE == 0
+ freePropStat(propStat);
+ #endif
}
#endif
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -251,6 +251,7 @@ typedef struct SubCSSet{
#define MIN_TFIDF_PROP_S4 3.5 // When we merge two CS's based on the
tf-idf/consine similarity score, we want
// to make sure that we do not merge two CS's
that may have same set of really common properties
// such as type, description. They should have
at least one discriminating prop in common.
+#define MIN_TFIDF_PROP_FINALTABLE 2.5 //Discriminating prop is prop that
appears in less than 10% of the table
//#define MIN_FROMTABLE_SIZE_S5 1 /* For example data */
#define MINIMUM_TABLE_SIZE 10000 //The minimum number of triples coverred by
a table (i.e., a final CS)
@@ -264,6 +265,7 @@ typedef struct SubCSSet{
#define REMOVE_INFREQ_PROP 1
#define REMOVE_LOTSOFNULL_SUBJECT 1
#define LOTSOFNULL_SUBJECT_THRESHOLD 0.1
+#define DETECT_INCORRECT_TYPE_SUBJECT 1 //Detect subjects that are
assigned wrong type
#define MIN_FK_FREQUENCY 0.1 // The frequency of a FK should
be > MIN_FK_FREQUENCY * The frequency of a mergedCS (or the number of tuples in
one table)
#define MIN_FK_PROPCOVERAGE 0.9 // The FK needs to happen in
MIN_FK_PROPCOVERAGE of all instances of the particular property
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list