Changeset: ba10cc54c6de for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ba10cc54c6de
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Detect subjects that may be wrongly typed, based on the discriminating 
properties of each type.

If a subject is marked with one type but contains a discriminating property 
of another type, it may be wrongly typed.

+ Add a function that prints all the merged freqCS with their property support 
and marks REALLY INFREQUENT properties.


diffs (truncated from 558 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -933,7 +933,7 @@ void printCSPropTypes(CSPropTypes* csPro
 
        /* Print cspropTypes */
        for (i = 0; i < numMergedCS; i++){
-               fprintf(fout, "MergedCS %d (Freq: %d): \n", i, 
freqCSset->items[csPropTypes[i].freqCSId].support);
+               fprintf(fout, "MergedCS %d (Freq: %d ) \n", i, 
freqCSset->items[csPropTypes[i].freqCSId].support);
                tmpIsMVCS = 0;
                tmpIsMVCSFilter = 0; 
                for(j = 0; j < csPropTypes[i].numProp; j++){
@@ -2153,7 +2153,82 @@ str printMergedFreqCSSet(CSset *freqCSse
        
        return MAL_SUCCEED;
 }
-#endif
+
+
+//Write every final table and the support of each of its props to finalfreqCSFullInfoWithPropSupport<freqThreshold>.txt. Infrequent props are not removed from the final table, they are only marked.
+static 
+str printFinalTableWithPropSupport(CSPropTypes* csPropTypes, int numTables, 
CSset *freqCSset, bat *mapbatid, int freqThreshold, CSlabel* labels){
+
+       int     i,j; 
+       int     freq; 
+       int     freqId; 
+       FILE    *fout; 
+       char    filename[100];
+       char    tmpStr[20];
+       BAT     *mapbat = NULL; 
+       BATiter mapi; 
+       str     propStr; 
+       int     ret; 
+       char*   schema = "rdf";
+       CS      cs;
+       
+       (void) mapi; //NOTE(review): mapi is assigned and used below, so this cast looks redundant
+       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+               throw(RDF, "rdf.rdfschema",
+                               "could not open the tokenizer\n");
+       }
+       
+       if ((mapbat = BATdescriptor(*mapbatid)) == NULL) { //NOTE(review): this error path does not call TKNZRclose for the tokenizer opened above - confirm
+               throw(MAL, "rdf.RDFextractCSwithTypes", RUNTIME_OBJECT_MISSING);
+       }
+       mapi = bat_iterator(mapbat); 
+       
+       strcpy(filename, "finalfreqCSFullInfoWithPropSupport"); //Output file name: this prefix + freqThreshold + .txt
+       sprintf(tmpStr, "%d", freqThreshold);
+       strcat(filename, tmpStr);
+       strcat(filename, ".txt");
+
+       fout = fopen(filename,"wt"); //NOTE(review): fout is not checked for NULL before the fprintf calls below
+
+       for (i = 0; i < numTables; i++){
+               freqId = csPropTypes[i].freqCSId; //Index of the freqCS that table i refers to
+               cs = (CS)freqCSset->items[freqId];
+               freq = cs.support; 
+
+               if (labels[freqId].name == BUN_NONE) { //No label assigned: the table is printed with the name DUMMY
+                       fprintf(fout,"Table %d - FreqId %d - Name: %s  (Freq: 
%d | Coverage: %d) \n", i, freqId, "DUMMY", freq, cs.coverage);
+               } else {
+                       str labelStr;
+                       
+                       getStringName(labels[freqId].name, &labelStr, mapi, 
mapbat, 1); 
+                       fprintf(fout,"Table %d - FreqId %d - Name: %s  (Freq: 
%d | Coverage: %d) \n", i, freqId, labelStr, freq, cs.coverage);
+                       GDKfree(labelStr); 
+               }
+
+
+               for (j = 0; j < cs.numProp; j++){ //NOTE(review): assumes csPropTypes[i].lstPropTypes[j] lines up with cs.lstProp[j] - confirm
+                       takeOid(cs.lstProp[j], &propStr);
+                       //fprintf(fout, "  P:" BUNFMT " --> ", cs.lstProp[j]);  
+                       fprintf(fout, "  P(" BUNFMT ") %s | PropFreq: %d ", 
cs.lstProp[j],propStr, csPropTypes[i].lstPropTypes[j].propFreq);    
+                       if (csPropTypes[i].lstPropTypes[j].propFreq < 
STRANGE_PROP_FREQUENCY){ //Props below this threshold are flagged in the output, not dropped
+                               fprintf(fout, " [REALLY INFREQUENT PROP] ");
+                       }
+                       GDKfree(propStr);
+                       fprintf(fout, "\n");
+
+               }       
+               fprintf(fout, "\n");
+       }
+
+       fclose(fout);
+
+       BBPunfix(mapbat->batCacheid);
+       TKNZRclose(&ret);
+       
+       return MAL_SUCCEED;
+}
+
+#endif /*  NO_OUTPUTFILE == 0 */
 
 /*
 static 
@@ -3459,6 +3534,165 @@ oid getMostSuitableName(CSlabel *labels,
 }
 #endif
 
+#if DETECT_INCORRECT_TYPE_SUBJECT
+
+#if USING_FINALTABLE
+static //Fill labelStat with the distinct table names and, for each name, the list of indexes of the tables carrying it
+void buildLabelStatForTable(LabelStat *labelStat, int numTables, CStableStat* 
cstablestat){
+       int     i; 
+       BUN     bun; 
+       int     *_tmp; 
+       int     numDummy = 0;
+       oid     name; 
+       int     tblIdx = -1;
+
+       //Preparation: count how many tables carry each distinct name; tables without a name only increase numDummy
+       for (i = 0; i  < numTables; i++){
+               if ( cstablestat->lstcstable[i].tblname != BUN_NONE){
+                       name = cstablestat->lstcstable[i].tblname;
+                       bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr) 
&name);
+                       if (bun == BUN_NONE) {
+                               //New string: rebuild the hash when the BAT count exceeds 4 * hash mask, then append the name
+                               if (labelStat->labelBat->T->hash && 
BATcount(labelStat->labelBat) > 4 * labelStat->labelBat->T->hash->mask) {
+                                       HASHdestroy(labelStat->labelBat);
+                                       BAThash(BATmirror(labelStat->labelBat), 
2*BATcount(labelStat->labelBat));
+                               }
+
+                               labelStat->labelBat = 
BUNappend(labelStat->labelBat, (ptr) &name, TRUE);
+                                               
+                               if(labelStat->numLabeladded == 
labelStat->numAllocation) 
+                               { //lstCount is full: grow it by INIT_DISTINCT_LABEL entries
+                                       labelStat->numAllocation += 
INIT_DISTINCT_LABEL; 
+                                       
+                                       _tmp = realloc(labelStat->lstCount, 
(labelStat->numAllocation * sizeof(int)));
+                               
+                                       if (!_tmp){ //NOTE(review): the failure is only logged; the NULL is still stored in lstCount and written through below
+                                               fprintf(stderr, "ERROR: 
Couldn't realloc memory!\n");
+                                       }
+                                       labelStat->lstCount = (int*)_tmp;
+                               }
+                               labelStat->lstCount[labelStat->numLabeladded] = 
1; 
+                               labelStat->numLabeladded++;
+                       }
+                       else{
+                               labelStat->lstCount[bun]++; //NOTE(review): uses the position returned by BUNfnd as the label index - presumably equal to the insertion order; confirm
+                       }
+               }
+               else
+                       numDummy++;
+       }
+       
+       printf("Collect label stat for final table: Total number of distinct 
labels %d \n", labelStat->numLabeladded);
+       printf("Number of DUMMY freqCS: %d \n",numDummy);
+
+       //Build list of freqId corresponding to each label (in this variant the stored ids are the table indexes i). NOTE(review): the malloc results are not checked
+       labelStat->freqIdList = (int**) malloc(sizeof(int*) * 
labelStat->numLabeladded);
+       for (i =0; i < labelStat->numLabeladded; i++){
+               labelStat->freqIdList[i] = (int*)malloc(sizeof(int) * 
labelStat->lstCount[i]);
+               //reset the lstCount so that it serves as the next free slot per label in the loop below
+               labelStat->lstCount[i] = 0;
+       }
+       
+       for (i = 0; i  < numTables; i++){ //Second pass: store each named table index in the list of its label
+               name = cstablestat->lstcstable[i].tblname;
+               if (name != BUN_NONE){
+                       bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr) 
&name);
+                       if (bun == BUN_NONE) {
+                               fprintf(stderr, "[Error] All the name should be 
stored already!\n");
+                       }
+                       else{
+                               tblIdx = labelStat->lstCount[bun];
+                               labelStat->freqIdList[bun][tblIdx] = i; 
+                               labelStat->lstCount[bun]++; //Counts up again to the per-label total of the first pass
+                       }
+               }
+       }
+
+}
+
+#else /* USING_FINALTABLE = 0*/
+
+static //Fill labelStat with the distinct labels of the freqCS whose parentFreqIdx is -1 and, for each label, the list of their freqCS indexes
+void buildLabelStatForFinalMergeCS(LabelStat *labelStat, CSset *freqCSset, 
CSlabel *labels){
+       int     i; 
+       BUN     bun; 
+       int     *_tmp; 
+       int     numDummy = 0;
+       oid     name; 
+       int     freqIdx = -1;
+
+       //Preparation: count how many freqCS carry each distinct label; freqCS without a label only increase numDummy
+       for (i = 0; i  < freqCSset->numCSadded; i++){
+               if (freqCSset->items[i].parentFreqIdx != -1) continue; //Only freqCS with parentFreqIdx == -1 are considered
+
+               if ( labels[i].name != BUN_NONE){
+                       name = labels[i].name;
+                       bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr) 
&name);
+                       if (bun == BUN_NONE) {
+                               //New string: rebuild the hash when the BAT count exceeds 4 * hash mask, then append the name
+                               if (labelStat->labelBat->T->hash && 
BATcount(labelStat->labelBat) > 4 * labelStat->labelBat->T->hash->mask) {
+                                       HASHdestroy(labelStat->labelBat);
+                                       BAThash(BATmirror(labelStat->labelBat), 
2*BATcount(labelStat->labelBat));
+                               }
+
+                               labelStat->labelBat = 
BUNappend(labelStat->labelBat, (ptr) &name, TRUE);
+                                               
+                               if(labelStat->numLabeladded == 
labelStat->numAllocation) 
+                               { //lstCount is full: grow it by INIT_DISTINCT_LABEL entries
+                                       labelStat->numAllocation += 
INIT_DISTINCT_LABEL; 
+                                       
+                                       _tmp = realloc(labelStat->lstCount, 
(labelStat->numAllocation * sizeof(int)));
+                               
+                                       if (!_tmp){ //NOTE(review): the failure is only logged; the NULL is still stored in lstCount and written through below
+                                               fprintf(stderr, "ERROR: 
Couldn't realloc memory!\n");
+                                       }
+                                       labelStat->lstCount = (int*)_tmp;
+                               }
+                               labelStat->lstCount[labelStat->numLabeladded] = 
1; 
+                               labelStat->numLabeladded++;
+                       }
+                       else{
+                               labelStat->lstCount[bun]++; //NOTE(review): uses the position returned by BUNfnd as the label index - presumably equal to the insertion order; confirm
+                       }
+               }
+               else
+                       numDummy++;
+       }
+       
+       printf("Collect label stat for final mergeCS: Total number of distinct 
labels %d \n", labelStat->numLabeladded);
+       printf("Number of DUMMY freqCS: %d \n",numDummy);
+
+       //Build list of freqId corresponding to each label. NOTE(review): the malloc results are not checked
+       labelStat->freqIdList = (int**) malloc(sizeof(int*) * 
labelStat->numLabeladded);
+       for (i =0; i < labelStat->numLabeladded; i++){
+               labelStat->freqIdList[i] = (int*)malloc(sizeof(int) * 
labelStat->lstCount[i]);
+               //reset the lstCount so that it serves as the next free slot per label in the loop below
+               labelStat->lstCount[i] = 0;
+       }
+       
+
+       for (i = 0; i  < freqCSset->numCSadded; i++){ //Second pass: store each labelled freqCS index in the list of its label
+               if (freqCSset->items[i].parentFreqIdx != -1) continue; 
+
+               name = labels[i].name;
+               if (name != BUN_NONE){
+                       bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr) 
&name);
+                       if (bun == BUN_NONE) {
+                               fprintf(stderr, "[Error] All the name should be 
stored already!\n");
+                       }
+                       else{
+                               freqIdx = labelStat->lstCount[bun];
+                               labelStat->freqIdList[bun][freqIdx] = i; 
+                               labelStat->lstCount[bun]++; //Counts up again to the per-label total of the first pass
+                       }
+               }
+       }
+
+}
+#endif
+
+#endif
+
 static
 void buildLabelStat(LabelStat *labelStat, CSlabel *labels, CSset *freqCSset, 
int k){
        int     i,j; 
@@ -3545,6 +3779,7 @@ void buildLabelStat(LabelStat *labelStat
        }
 
 }
+
 static 
 void freeLabelStat(LabelStat *labelStat){
        int i; 
@@ -5258,6 +5493,219 @@ str RDFassignCSId(int *ret, BAT *sbat, B
        return MAL_SUCCEED; 
 }
 
+#if DETECT_INCORRECT_TYPE_SUBJECT
+
+static 
+str RDFcheckWrongTypeSubject(BAT *sbat, BATiter si, BATiter pi, BATiter oi, 
CSset *freqCSset, int maxNumPwithDup, int numTables, int 
*mTblIdxFreqIdxMapping, LabelStat *labelStat, oid *subjCSMap, int 
*csFreqCSMapping){
+
+       BUN     p, q; 
+       oid     *sbt, *pbt, *obt; 
+       oid     curS;           /* current Subject oid */
+       oid     curP;           /* current Property oid */
+       int     numP;           /* Number of properties for current S */
+       int     numPwithDup = 0; 
+       oid*    buff;    
+
+       //Only keep the most specific ontology-based rdftype value 
+       int     maxNumOntology = 20;            
+       oid*    rdftypeOntologyValues = NULL; 
+       char*   rdftypeSelectedValues = NULL; //Store which value is selected
+       char*   rdftypeSpecificLevels = NULL; //Store specific level for each 
value
+       BUN*    rdftypeOntClassPos = NULL; //Position in the ontology class     
             
+
+       int     numTypeValues = 0;
+       #if EXTRAINFO_FROM_RDFTYPE
+       int     tmpMaxSpecificLevel = 0; 
+       int     tmpSpecificLevel = 0; 
+       BUN     tmpOntClassPos = BUN_NONE;  //index of the ontology class in 
the ontmetaBat
+       PropStat        *ontPropStat = NULL;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to