Changeset: 2f50721c1ee4 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=2f50721c1ee4
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Count number of types per prop for basic freqCS


diffs (206 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -198,6 +198,8 @@ char isCSTable(CS item, oid name){
        if (item.type == DIMENSIONCS) return 1; 
 
        #if REMOVE_SMALL_TABLE
+       if (item.support > acceptableTableSize) return 1;
+
        if (item.coverage < MINIMUM_TABLE_SIZE) return 0;
        
        //More strict with table which does not have name
@@ -745,6 +747,66 @@ void initCSPropTypes(CSPropTypes* csProp
        //return csPropTypes;
 }
 
+#if COUNT_NUMTYPES_PERPROP
+
+/*
+ * Fill one CSPropTypes entry per CS in freqCSset: entry i describes
+ * freqCSset->items[i]. Each property starts with zeroed counters, STRING as
+ * default type and MULTIVALUES + 1 type slots. numMergedCS is only asserted.
+ */
+static 
+void initCSPropTypesForBasicFreqCS(CSPropTypes* csPropTypes, CSset* freqCSset, int numMergedCS){
+       int numFreqCS = freqCSset->numCSadded;
+       int csIdx, propIdx, typeIdx, nextId = 0;
+       CSPropTypes *entry;
+       PropTypes *pt;
+       for (csIdx = 0; csIdx < numFreqCS; csIdx++, nextId++){
+               entry = &csPropTypes[nextId];
+               entry->freqCSId = csIdx;
+               entry->numProp = freqCSset->items[csIdx].numProp;
+               entry->numInfreqProp = 0;
+               entry->numNonDefTypes = 0;
+               entry->lstPropTypes = (PropTypes*) GDKmalloc(sizeof(PropTypes) * entry->numProp);
+               for (propIdx = 0; propIdx < entry->numProp; propIdx++){
+                       pt = &entry->lstPropTypes[propIdx];
+                       pt->prop = freqCSset->items[csIdx].lstProp[propIdx];
+                       #if STAT_ANALYZE
+                       pt->numNull = 0;
+                       pt->numMVType = 0;
+                       pt->numSingleType = 0;
+                       #endif
+                       pt->propFreq = 0;
+                       pt->propCover = 0;
+                       pt->numType = MULTIVALUES + 1;
+                       pt->defaultType = STRING;
+                       pt->isMVProp = 0;
+                       pt->isPKProp = 0;
+                       pt->numMvTypes = 0;
+                       pt->defColIdx = -1;
+                       pt->isFKProp = 0;
+                       pt->refTblId = -1;
+                       pt->refTblSupport = 0;
+                       pt->numReferring = 0;
+                       pt->numDisRefValues = 0;
+                       pt->isDirtyFKProp = 0;
+                       pt->lstTypes = (char*)GDKmalloc(sizeof(char) * pt->numType);
+                       pt->lstFreq = (int*)GDKmalloc(sizeof(int) * pt->numType);
+                       pt->lstFreqWithMV = (int*)GDKmalloc(sizeof(int) * pt->numType);
+                       pt->colIdxes = (int*)GDKmalloc(sizeof(int) * pt->numType);
+                       pt->TableTypes = (char*)GDKmalloc(sizeof(char) * pt->numType);
+                       /* lstTypes is allocated above but not initialized in this function */
+                       for (typeIdx = 0; typeIdx < pt->numType; typeIdx++){
+                               pt->lstFreq[typeIdx] = 0;
+                               pt->lstFreqWithMV[typeIdx] = 0;
+                               pt->TableTypes[typeIdx] = 0;
+                               pt->colIdxes[typeIdx] = -1;
+                       }
+               }
+       }
+       assert(nextId == numMergedCS);
+}
+#endif
+
 static 
 char isMultiValueCol(PropTypes pt){
        double tmpRatio;
@@ -5071,7 +5133,8 @@ oid       **ontmetadata = NULL;
 int    ontmetadataCount = 0;
 BAT    *ontmetaBat = NULL;
 OntClass  *ontclassSet = NULL; 
-
+int    totalNumberOfTriples = 0;
+int    acceptableTableSize = 0;
 
 static 
 BAT* buildTypeOidBat(void){
@@ -5139,6 +5202,7 @@ int getOntologySpecificLevel(oid valueOi
                return ontclassSet[*ontClassPos].hierDepth;
 }
 
+#if DETECT_INCORRECT_TYPE_SUBJECT
 static
 char isSupSuperOntology(oid value1, oid value2){
        BUN ontclasspos1 = BUN_NONE;
@@ -5165,6 +5229,7 @@ char isSupSuperOntology(oid value1, oid 
 
        return 0;
 }
+#endif
        
 static
 PropStat* getPropStatisticsByOntologyClass(int numClass, OntClass 
*ontClassSet){
@@ -6250,6 +6315,41 @@ str RDFExtractCSPropTypes(int *ret, BAT 
        return MAL_SUCCEED; 
 }
 
+/*
+ * For each of the first numCS CSs in freqCSset, write its property count and
+ * the number of (property, type) pairs with lstFreqWithMV > 0 (type indexes
+ * below MULTIVALUES) to csPropTypeBasicCS.txt; print the overall average to
+ * stdout. Reports on stderr and returns early if the file cannot be opened.
+ */
+static 
+void printNumTypePerProp(CSPropTypes* csPropTypes, int numCS, CSset *freqCSset){
+       int     i,j,k; 
+       int     numProp, sumNumType;
+       int     totalNumProp = 0; 
+       int     totalNumTypes = 0; 
+       FILE    *fout = fopen("csPropTypeBasicCS.txt","wt");
+       if (fout == NULL){      /* fopen failed: nothing can be written */
+               fprintf(stderr, "Cannot open csPropTypeBasicCS.txt for writing\n");
+               return;
+       }
+       fprintf(fout, "#FreqCSId        #NumProp #NumTypes #AvgNumType/Prop \n");
+       for (i = 0; i < numCS; i++){
+               numProp = freqCSset->items[i].numProp;  /* read in place, no struct copy */
+               sumNumType = 0; 
+               for (j = 0; j < numProp; j++){
+                       for (k = 0; k < MULTIVALUES; k++){
+                               if (csPropTypes[i].lstPropTypes[j].lstFreqWithMV[k] > 0) sumNumType++;
+                       }
+               }
+               /* A CS without properties must not cause a division by zero */
+               fprintf(fout, "%d       %d      %d      %.2f \n", i, numProp, sumNumType,
+                       numProp > 0 ? (float) sumNumType/numProp : 0.0f);
+               totalNumProp += numProp;
+               totalNumTypes += sumNumType;
+       }
+       printf("Average number of types per prop in freqCS: %f \n", totalNumProp > 0 ? (float)totalNumTypes/totalNumProp : 0.0f);
+       fclose(fout); 
+}
 
 #if NO_OUTPUTFILE == 0 
 static 
@@ -8802,8 +8902,9 @@ RDFextractCSwithTypes(int *ret, bat *sba
        *subjCSMap = (oid *) malloc (sizeof(oid) * ((*maxSoid) + 1)); 
        initArray(*subjCSMap, (*maxSoid) + 1, BUN_NONE);
 
-
-       
+       totalNumberOfTriples =  BATcount(sbat); 
+       acceptableTableSize = totalNumberOfTriples / 2000;
+       printf("Acceptable table size = %d \n", acceptableTableSize);
        
        tmpLastT = clock();
 
@@ -8885,6 +8986,20 @@ RDFextractCSwithTypes(int *ret, bat *sba
        
        /*get the statistic */
        //getTopFreqCSs(csMap,*freqThreshold);
+       #if COUNT_NUMTYPES_PERPROP
+       {
+       
+       /* Get possible types of each property in a table (i.e., mergedCS) */
+       CSPropTypes *csPropTypes = (CSPropTypes*)GDKmalloc(sizeof(CSPropTypes) 
* (freqCSset->numCSadded)); 
+       initCSPropTypesForBasicFreqCS(csPropTypes, freqCSset, 
freqCSset->numCSadded);
+       
+       printf("Extract CSPropTypes from basic CS's \n");
+       RDFExtractCSPropTypes(ret, sbat, si, pi, oi, *subjCSMap, 
csIdFreqIdxMap, csPropTypes, *maxNumPwithDup);
+       printNumTypePerProp(csPropTypes, freqCSset->numCSadded, freqCSset);
+
+       freeCSPropTypes(csPropTypes, freqCSset->numCSadded);
+       }
+       #endif
 
        // Create label per freqCS
 
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -152,8 +152,12 @@ typedef struct PropStat {
 
 #define STORE_PERFORMANCE_METRIC_INFO  1
 
-#define NO_OUTPUTFILE  0               /*Do not write the output to any file */
+#define NO_OUTPUTFILE  1               /*Do not write the output to any file */
 
+extern int totalNumberOfTriples; 
+extern int acceptableTableSize;
+
+#define        COUNT_NUMTYPES_PERPROP 1
 
 typedef struct CS
 {
@@ -268,7 +272,7 @@ typedef struct SubCSSet{
 #define REMOVE_LOTSOFNULL_SUBJECT      1
 #define        LOTSOFNULL_SUBJECT_THRESHOLD    0.1
 
-#define DETECT_INCORRECT_TYPE_SUBJECT  1       //Detect subjects that are 
assigned wrong type. (Default value 0)
+#define DETECT_INCORRECT_TYPE_SUBJECT  0       //Detect subjects that are 
assigned wrong type. (Default value 0)
 #define USING_FINALTABLE               0       //Using the final table for 
collecting label stat or using set of 
                                                //final merged CS. The set of 
merged CS will be larged as it may
                                                //contain small table
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to