Changeset: 68e780e2c1f0 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=68e780e2c1f0
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Fix a bug in choosing the best name from the ontology candidates.
Reason: The name-assignment code was copied to the wrong place, so bestOntCandIdx was used before it had been computed.
- Add a stricter condition (MIN_POSSIBLE_TYPE_FREQ_THRESHOLD) for choosing the name based on a not-so-good type value
in the case that no good type/ontology/FK name is found.
diffs (73 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2319,12 +2319,7 @@ void getTableName(CSlabel* label, CSset*
// TODO: Improve this score a bit, by choosing the higher tfidf score,
than number of matched prop
if (choosenOntologyTypeValue == BUN_NONE && isGoodTypeExist == 0 &&
resultCount[csIdx] >= 1){
- label->name = result[csIdx][bestOntCandIdx];
- nameFound = 1;
- #if INFO_WHERE_NAME_FROM
- label->isOntology = 1;
- #endif
-
+
// Only put ontology-based class to the candidate if it is
choosen as the class name
{
int maxNumMatchedProp = -1;
@@ -2341,6 +2336,14 @@ void getTableName(CSlabel* label, CSset*
}
label->candidatesCount += resultCount[csIdx];
}
+
+
+ label->name = result[csIdx][bestOntCandIdx];
+ nameFound = 1;
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
+
}
@@ -2387,12 +2390,14 @@ void getTableName(CSlabel* label, CSset*
}
}
-
//if no name is found, check again the typecount to assign a name
#if USE_BEST_TYPEVALUE_INSTEADOF_DUMMY
if (!nameFound){
for (i = 0; i < typeAttributesCount; ++i){
if (typeAttributesHistogramCount[csIdx][i] == 0)
continue;
+
+ if (typeAttributesHistogram[csIdx][i][0].percent <
MIN_POSSIBLE_TYPE_FREQ_THRESHOLD) continue;
+
//printf("Current candidate count =
%d",label->candidatesCount);
label->candidatesType = 1;
label->candidates = GDKrealloc(label->candidates,
sizeof(oid));
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -105,7 +105,8 @@ enum {
#define USE_TABLE_NAME 1 // calculate and store the final labels
#define SHOW_CANDIDATES 0 // inserts a row in UML diagrams to
show all candidate names
#define ONLY_USE_ONTOLOGYBASED_TYPE 0
-#define USE_BEST_TYPEVALUE_INSTEADOF_DUMMY 1 //Use the most frequent type
value instead of a dummy for the label name
+#define USE_BEST_TYPEVALUE_INSTEADOF_DUMMY 1 //Use the most frequent type
value instead of a dummy for the label name
+#define MIN_POSSIBLE_TYPE_FREQ_THRESHOLD 20 //However, that type must still
appears in more than a minimum threshold
rdf_export void
getPropNameShort(char** name, char* propStr);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -261,7 +261,7 @@ typedef struct SubCSSet{
//#define INFREQ_PROP_THRESHOLD 0.2 //For Testing
#define REMOVE_INFREQ_PROP 1
#define REMOVE_LOTSOFNULL_SUBJECT 1
-#define LOTSOFNULL_SUBJECT_THRESHOLD 0.2
+#define LOTSOFNULL_SUBJECT_THRESHOLD 0.1
#define MIN_FK_FREQUENCY 0.1 // The frequency of a FK should
be > MIN_FK_FREQUENCY * The frequency of a mergedCS (or the number of tuples in
one table)
#define MIN_FK_PROPCOVERAGE 0.9 // The FK needs to happen in
MIN_FK_PROPCOVERAGE of all instances of the particular property
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list