Changeset: 6fe23ef21e32 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=6fe23ef21e32
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Remove subjects that have lots of missing properties (lots of NULLs) in a table.

Only keep the ontology-based candidate if it is used as the table name.


diffs (194 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -2310,22 +2310,6 @@ void getTableName(CSlabel* label, CSset*
        // add all ontology candidates to list of candidates
        // Find the best candidate by looking at the number of matched prop
        // between the CS and the ontology candidate
-       // 
-       if (resultCount[csIdx] >= 1) {
-               int maxNumMatchedProp = -1;
-               bestOntCandIdx = 0;
-               label->candidatesOntology = resultCount[csIdx];
-               label->candidates = GDKrealloc(label->candidates, sizeof(oid) * 
(label->candidatesCount + resultCount[csIdx]));
-               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
-               for (i = 0; i < resultCount[csIdx]; ++i) {
-                       label->candidates[label->candidatesCount + i] = 
result[csIdx][i];
-                       if (resultMatchedProp[csIdx][i] > maxNumMatchedProp){
-                               maxNumMatchedProp = resultMatchedProp[csIdx][i];
-                               bestOntCandIdx = i;
-                       }
-               }
-               label->candidatesCount += resultCount[csIdx];
-       }
        
        // If the name found previously (based on the type values) is not 
        // an ontology-based value (e.g., simply a string), and not a really 
good (so frequent) type value 
@@ -2340,6 +2324,23 @@ void getTableName(CSlabel* label, CSset*
                #if INFO_WHERE_NAME_FROM
                label->isOntology = 1; 
                #endif
+               
+               // Only put ontology-based class to the candidate if it is 
chosen as the class name
+               {
+               int maxNumMatchedProp = -1;
+               bestOntCandIdx = 0;
+               label->candidatesOntology = resultCount[csIdx];
+               label->candidates = GDKrealloc(label->candidates, sizeof(oid) * 
(label->candidatesCount + resultCount[csIdx]));
+               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
+               for (i = 0; i < resultCount[csIdx]; ++i) {
+                       label->candidates[label->candidatesCount + i] = 
result[csIdx][i];
+                       if (resultMatchedProp[csIdx][i] > maxNumMatchedProp){
+                               maxNumMatchedProp = resultMatchedProp[csIdx][i];
+                               bestOntCandIdx = i;
+                       }
+               }
+               label->candidatesCount += resultCount[csIdx];
+               }
        }
 
 
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -7768,7 +7768,7 @@ str printFinalStructure(CStableStat* cst
 #endif
 
 static
-void initCSTableIdxMapping(CSset* freqCSset, int* csTblIdxMapping, int* 
mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping, int *numTables, CSlabel 
*labels){
+void initCSTableIdxMapping(CSset* freqCSset, int* csTblIdxMapping, int* 
csFreqCSMapping, int* mfreqIdxTblIdxMapping, int* mTblIdxFreqIdxMapping, int 
*numTables, CSlabel *labels){
 
 int            i, k; 
 CS             cs;
@@ -7789,6 +7789,7 @@ CS                cs;
        
        for (i = 0; i < freqCSset->numOrigFreqCS; i++){
                cs = (CS)freqCSset->items[i];
+               csFreqCSMapping[cs.csId] = i; 
                tmpParentidx = cs.parentFreqIdx;
                
                if (tmpParentidx == -1){        // maximumCS 
@@ -8078,12 +8079,16 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        {
        int numTables = 0; 
-       int *csTblIdxMapping, *mfreqIdxTblIdxMapping, *mTblIdxFreqIdxMapping;
+       int *csTblIdxMapping, *mfreqIdxTblIdxMapping, *mTblIdxFreqIdxMapping, 
*csFreqCSMapping;
        
 
        csTblIdxMapping = (int *) malloc (sizeof (int) * (*maxCSoid + 1)); 
        initIntArray(csTblIdxMapping, (*maxCSoid + 1), -1);
 
+       csFreqCSMapping = (int *) malloc (sizeof (int) * (*maxCSoid + 1));
+       initIntArray(csFreqCSMapping, (*maxCSoid + 1), -1);
+
+
        mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) * 
freqCSset->numCSadded); 
        initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1);
 
@@ -8092,7 +8097,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
 
        //Mapping from from CSId to TableIdx 
        printf("Init CS tableIdxMapping \n");
-       initCSTableIdxMapping(freqCSset, csTblIdxMapping, 
mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables, *labels);
+       initCSTableIdxMapping(freqCSset, csTblIdxMapping, csFreqCSMapping, 
mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables, *labels);
 
 
        #if NO_OUTPUTFILE == 0 
@@ -9500,12 +9505,17 @@ RDFreorganize(int *ret, CStableStat *cst
        oid             *lastSubjId;    /* Store the last subject Id in each 
freqCS */
        //oid           *lastSubjIdEx;  /* Store the last subject Id (of 
not-default type) in each freqCS */
        int             tblIdx; 
+       #if REMOVE_LOTSOFNULL_SUBJECT
+       int             freqIdx;                
+       int             numSubjRemoved = 0;
+       #endif
        oid             lastS;
        oid             l,r; 
        bat             oNewBatid, pNewBatid; 
        int             *csTblIdxMapping;       /* Store the mapping from a CS 
id to an index of a maxCS or mergeCS in freqCSset. */
        int             *mfreqIdxTblIdxMapping;  /* Store the mapping from the 
idx of a max/merge freqCS to the table Idx */
        int             *mTblIdxFreqIdxMapping;  /* Invert of 
mfreqIdxTblIdxMapping */
+       int             *csFreqCSMapping = NULL; 
        int             numTables = 0; 
        PropStat        *propStat; 
        int             numdistinctMCS = 0; 
@@ -9542,7 +9552,9 @@ RDFreorganize(int *ret, CStableStat *cst
 
        csTblIdxMapping = (int *) malloc (sizeof (int) * (maxCSoid + 1)); 
        initIntArray(csTblIdxMapping, (maxCSoid + 1), -1);
-
+       
+       csFreqCSMapping = (int *) malloc (sizeof (int) * (maxCSoid + 1));
+       initIntArray(csFreqCSMapping, (maxCSoid + 1), -1);
 
        mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) * 
freqCSset->numCSadded); 
        initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1);
@@ -9552,7 +9564,7 @@ RDFreorganize(int *ret, CStableStat *cst
 
        //Mapping from from CSId to TableIdx 
        printf("Init CS tableIdxMapping \n");
-       initCSTableIdxMapping(freqCSset, csTblIdxMapping, 
mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables, labels);
+       initCSTableIdxMapping(freqCSset, csTblIdxMapping, csFreqCSMapping, 
mfreqIdxTblIdxMapping, mTblIdxFreqIdxMapping, &numTables, labels);
 
 
        if ((sbat = BATdescriptor(*sbatid)) == NULL) {
@@ -9642,6 +9654,7 @@ RDFreorganize(int *ret, CStableStat *cst
                freeCSset(freqCSset); 
                free(subjCSMap);
                free(csTblIdxMapping);
+               free(csFreqCSMapping);
                free(mfreqIdxTblIdxMapping);
                free(mTblIdxFreqIdxMapping);
                freeCSPropTypes(csPropTypes,numTables);
@@ -9684,9 +9697,17 @@ RDFreorganize(int *ret, CStableStat *cst
        BATloop(sbat, p, q){
                sbt = (oid *) BUNtloc(si, p);
                tblIdx = csTblIdxMapping[subjCSMap[*sbt]];
-       
+
+               #if REMOVE_LOTSOFNULL_SUBJECT
+               freqIdx = csFreqCSMapping[subjCSMap[*sbt]];
+               if (freqCSset->items[freqIdx].numProp < 
cstablestat->lstcstable[tblIdx].numCol * LOTSOFNULL_SUBJECT_THRESHOLD){
+                       tblIdx = -1;
+                       numSubjRemoved++;
+               }
+               #endif                  
+
                if (tblIdx != -1){
-
+                       
                        if (lastS != *sbt){     //new subject
                                lastS = *sbt; 
                                
@@ -9715,7 +9736,9 @@ RDFreorganize(int *ret, CStableStat *cst
                
        }
 
-
+        #if REMOVE_LOTSOFNULL_SUBJECT
+       printf("Number of subject removed is: %d \n", numSubjRemoved);
+       #endif
        //BATprint(VIEWcreate(BATmirror(lmap),rmap)); 
        
        origobat = getOriginalUriOBat(obat);    //Return obat without 
type-specific information for URI & BLANKNODE
@@ -9791,6 +9814,7 @@ RDFreorganize(int *ret, CStableStat *cst
        freeCSset(freqCSset); 
        free(subjCSMap); 
        free(csTblIdxMapping);
+       free(csFreqCSMapping);
        free(mfreqIdxTblIdxMapping);
        free(mTblIdxFreqIdxMapping);
        free(lastSubjId);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -260,6 +260,8 @@ typedef struct SubCSSet{
 #define        INFREQ_PROP_THRESHOLD   0.01
 //#define      INFREQ_PROP_THRESHOLD   0.2     //For Testing
 #define REMOVE_INFREQ_PROP     1
+#define REMOVE_LOTSOFNULL_SUBJECT      1
+#define        LOTSOFNULL_SUBJECT_THRESHOLD    0.2
 
 #define        MIN_FK_FREQUENCY        0.1     // The frequency of a FK should 
be > MIN_FK_FREQUENCY * The frequency of a mergedCS (or the number of tuples in 
one table)      
 #define MIN_FK_PROPCOVERAGE    0.9     // The FK needs to happen in 
MIN_FK_PROPCOVERAGE of all instances of the particular property
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to