Changeset: 6fe23ef21e32 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6fe23ef21e32
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Remove subjects that have lots of missing properties (lots of NULLs) in a table.
Only keep the ontology-based candidate if it is used as the table name.
diffs (194 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2310,22 +2310,6 @@ void getTableName(CSlabel* label, CSset*
// add all ontology candidates to list of candidates
// Find the best candidate by looking at the number of matched prop
// between the CS and the ontology candidate
- //
- if (resultCount[csIdx] >= 1) {
- int maxNumMatchedProp = -1;
- bestOntCandIdx = 0;
- label->candidatesOntology = resultCount[csIdx];
- label->candidates = GDKrealloc(label->candidates, sizeof(oid) *
(label->candidatesCount + resultCount[csIdx]));
- if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
- for (i = 0; i < resultCount[csIdx]; ++i) {
- label->candidates[label->candidatesCount + i] =
result[csIdx][i];
- if (resultMatchedProp[csIdx][i] > maxNumMatchedProp){
- maxNumMatchedProp = resultMatchedProp[csIdx][i];
- bestOntCandIdx = i;
- }
- }
- label->candidatesCount += resultCount[csIdx];
- }
// If the name found previously (based on the type values) is not
// an ontology-based value (e.g., simply a string), and not a really
good (so frequent) type value
@@ -2340,6 +2324,23 @@ void getTableName(CSlabel* label, CSset*
#if INFO_WHERE_NAME_FROM
label->isOntology = 1;
#endif
+
+ // Only put ontology-based class to the candidate if it is
choosen as the class name
+ {
+ int maxNumMatchedProp = -1;
+ bestOntCandIdx = 0;
+ label->candidatesOntology = resultCount[csIdx];
+ label->candidates = GDKrealloc(label->candidates, sizeof(oid) *
(label->candidatesCount + resultCount[csIdx]));
+ if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
+ for (i = 0; i < resultCount[csIdx]; ++i) {
+ label->candidates[label->candidatesCount + i] =
result[csIdx][i];
+ if (resultMatchedProp[csIdx][i] > maxNumMatchedProp){
+ maxNumMatchedProp = resultMatchedProp[csIdx][i];
+ bestOntCandIdx = i;
+ }
+ }
+ label->candidatesCount += resultCount[csIdx];
+ }
}
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -7768,7 +7768,7 @@ str printFinalStructure(CStableStat* cst
#endif
static
-void initCSTableIdxMapping(CSset* freqCSset, int* csTblIdxMapping, int*
mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping, int *numTables, CSlabel
*labels){
+void initCSTableIdxMapping(CSset* freqCSset, int* csTblIdxMapping, int*
csFreqCSMapping, int* mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping, int
*numTables, CSlabel *labels){
int i, k;
CS cs;
@@ -7789,6 +7789,7 @@ CS cs;
for (i = 0; i < freqCSset->numOrigFreqCS; i++){
cs = (CS)freqCSset->items[i];
+ csFreqCSMapping[cs.csId] = i;
tmpParentidx = cs.parentFreqIdx;
if (tmpParentidx == -1){ // maximumCS
@@ -8078,12 +8079,16 @@ RDFextractCSwithTypes(int *ret, bat *sba
{
int numTables = 0;
- int *csTblIdxMapping, *mfreqIdxTblIdxMapping, *mTblIdxFreqIdxMapping;
+ int *csTblIdxMapping, *mfreqIdxTblIdxMapping, *mTblIdxFreqIdxMapping,
*csFreqCSMapping;
csTblIdxMapping = (int *) malloc (sizeof (int) * (*maxCSoid + 1));
initIntArray(csTblIdxMapping, (*maxCSoid + 1), -1);
+ csFreqCSMapping = (int *) malloc (sizeof (int) * (*maxCSoid + 1));
+ initIntArray(csFreqCSMapping, (*maxCSoid + 1), -1);
+
+
mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) *
freqCSset->numCSadded);
initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1);
@@ -8092,7 +8097,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
//Mapping from from CSId to TableIdx
printf("Init CS tableIdxMapping \n");
- initCSTableIdxMapping(freqCSset, csTblIdxMapping,
mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables, *labels);
+ initCSTableIdxMapping(freqCSset, csTblIdxMapping, csFreqCSMapping,
mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables, *labels);
#if NO_OUTPUTFILE == 0
@@ -9500,12 +9505,17 @@ RDFreorganize(int *ret, CStableStat *cst
oid *lastSubjId; /* Store the last subject Id in each
freqCS */
//oid *lastSubjIdEx; /* Store the last subject Id (of
not-default type) in each freqCS */
int tblIdx;
+ #if REMOVE_LOTSOFNULL_SUBJECT
+ int freqIdx;
+ int numSubjRemoved = 0;
+ #endif
oid lastS;
oid l,r;
bat oNewBatid, pNewBatid;
int *csTblIdxMapping; /* Store the mapping from a CS
id to an index of a maxCS or mergeCS in freqCSset. */
int *mfreqIdxTblIdxMapping; /* Store the mapping from the
idx of a max/merge freqCS to the table Idx */
int *mTblIdxFreqIdxMapping; /* Invert of
mfreqIdxTblIdxMapping */
+ int *csFreqCSMapping = NULL;
int numTables = 0;
PropStat *propStat;
int numdistinctMCS = 0;
@@ -9542,7 +9552,9 @@ RDFreorganize(int *ret, CStableStat *cst
csTblIdxMapping = (int *) malloc (sizeof (int) * (maxCSoid + 1));
initIntArray(csTblIdxMapping, (maxCSoid + 1), -1);
-
+
+ csFreqCSMapping = (int *) malloc (sizeof (int) * (maxCSoid + 1));
+ initIntArray(csFreqCSMapping, (maxCSoid + 1), -1);
mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) *
freqCSset->numCSadded);
initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1);
@@ -9552,7 +9564,7 @@ RDFreorganize(int *ret, CStableStat *cst
//Mapping from from CSId to TableIdx
printf("Init CS tableIdxMapping \n");
- initCSTableIdxMapping(freqCSset, csTblIdxMapping,
mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables, labels);
+ initCSTableIdxMapping(freqCSset, csTblIdxMapping, csFreqCSMapping,
mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables, labels);
if ((sbat = BATdescriptor(*sbatid)) == NULL) {
@@ -9642,6 +9654,7 @@ RDFreorganize(int *ret, CStableStat *cst
freeCSset(freqCSset);
free(subjCSMap);
free(csTblIdxMapping);
+ free(csFreqCSMapping);
free(mfreqIdxTblIdxMapping);
free(mTblIdxFreqIdxMapping);
freeCSPropTypes(csPropTypes,numTables);
@@ -9684,9 +9697,17 @@ RDFreorganize(int *ret, CStableStat *cst
BATloop(sbat, p, q){
sbt = (oid *) BUNtloc(si, p);
tblIdx = csTblIdxMapping[subjCSMap[*sbt]];
-
+
+ #if REMOVE_LOTSOFNULL_SUBJECT
+ freqIdx = csFreqCSMapping[subjCSMap[*sbt]];
+ if (freqCSset->items[freqIdx].numProp <
cstablestat->lstcstable[tblIdx].numCol * LOTSOFNULL_SUBJECT_THRESHOLD){
+ tblIdx = -1;
+ numSubjRemoved++;
+ }
+ #endif
+
if (tblIdx != -1){
-
+
if (lastS != *sbt){ //new subject
lastS = *sbt;
@@ -9715,7 +9736,9 @@ RDFreorganize(int *ret, CStableStat *cst
}
-
+ #if REMOVE_LOTSOFNULL_SUBJECT
+ printf("Number of subject removed is: %d \n", numSubjRemoved);
+ #endif
//BATprint(VIEWcreate(BATmirror(lmap),rmap));
origobat = getOriginalUriOBat(obat); //Return obat without
type-specific information for URI & BLANKNODE
@@ -9791,6 +9814,7 @@ RDFreorganize(int *ret, CStableStat *cst
freeCSset(freqCSset);
free(subjCSMap);
free(csTblIdxMapping);
+ free(csFreqCSMapping);
free(mfreqIdxTblIdxMapping);
free(mTblIdxFreqIdxMapping);
free(lastSubjId);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -260,6 +260,8 @@ typedef struct SubCSSet{
#define INFREQ_PROP_THRESHOLD 0.01
//#define INFREQ_PROP_THRESHOLD 0.2 //For Testing
#define REMOVE_INFREQ_PROP 1
+#define REMOVE_LOTSOFNULL_SUBJECT 1
+#define LOTSOFNULL_SUBJECT_THRESHOLD 0.2
#define MIN_FK_FREQUENCY 0.1 // The frequency of a FK should
be > MIN_FK_FREQUENCY * The frequency of a mergedCS (or the number of tuples in
one table)
#define MIN_FK_PROPCOVERAGE 0.9 // The FK needs to happen in
MIN_FK_PROPCOVERAGE of all instances of the particular property
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list