Changeset: e7510d774b04 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7510d774b04
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Count the percentage of ontology props used in each final relational table
diffs (271 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -802,7 +802,6 @@ void initCSPropTypesForBasicFreqCS(CSPro
}
assert(id == numMergedCS);
-
//return csPropTypes;
}
#endif
@@ -3585,6 +3584,7 @@ void generatecsRelSum(CSrel csRel, int f
}
+#if USE_LABEL_FOR_MERGING
static
LabelStat* initLabelStat(void){
LabelStat *labelStat = (LabelStat*) malloc(sizeof(LabelStat));
@@ -3603,10 +3603,12 @@ LabelStat* initLabelStat(void){
return labelStat;
}
+#endif
/*
*
* */
+#if USE_LABEL_FOR_MERGING
#if USE_ALTERNATIVE_NAME
static
oid getMostSuitableName(CSlabel *labels, int freqIdx, int candIdx){
@@ -3641,6 +3643,7 @@ oid getMostSuitableName(CSlabel *labels,
}
#endif
+#endif
#if DETECT_INCORRECT_TYPE_SUBJECT
@@ -3801,6 +3804,7 @@ void buildLabelStatForFinalMergeCS(Label
#endif
+#if USE_LABEL_FOR_MERGING
static
void buildLabelStat(LabelStat *labelStat, CSlabel *labels, CSset *freqCSset,
int k){
int i,j;
@@ -3887,7 +3891,9 @@ void buildLabelStat(LabelStat *labelStat
}
}
-
+#endif
+
+#if USE_LABEL_FOR_MERGING
static
void freeLabelStat(LabelStat *labelStat){
int i;
@@ -3901,6 +3907,7 @@ void freeLabelStat(LabelStat *labelStat)
BBPreclaim(labelStat->labelBat);
free(labelStat);
}
+#endif
static
void doMerge(CSset *freqCSset, int ruleNum, int freqId1, int freqId2, oid
*mergecsId, CSlabel** labels, oid** ontmetadata, int ontmetadataCount, oid
name, int isType, int isOntology, int isFK){
@@ -3954,6 +3961,7 @@ void doMerge(CSset *freqCSset, int ruleN
}
+#if USE_LABEL_FOR_MERGING
static
str mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId,
oid** ontmetadata, int ontmetadataCount,bat *mapbatid){
int i, j;
@@ -4188,6 +4196,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
return MAL_SUCCEED;
}
+#endif
static
void mergeMaxFreqCSByS5(CSrel *csrelMergeFreqSet, CSset *freqCSset, CSlabel**
labels, oid* mergeCSFreqCSMap, int curNumMergeCS, oid *mergecsId, oid**
ontmetadata, int ontmetadataCount){
@@ -4319,7 +4328,7 @@ void mergeMaxFreqCSByS5(CSrel *csrelMerg
}
-
+#if USE_LABEL_FOR_MERGING
static
char isSemanticSimilar(int freqId1, int freqId2, CSlabel* labels,
OntoUsageNode *tree, int numOrigFreqCS, oid *ancestor, BAT *ontmetaBat,
OntClass *ontclassSet){ /*Rule S1 S2 S3*/
int i, j;
@@ -4433,6 +4442,7 @@ char isSemanticSimilar(int freqId1, int
return 0;
}
+#endif
static
void initTFIDFInfos(TFIDFInfo *tfidfInfos, int curNumMergeCS, oid*
mergeCSFreqCSMap, CSset *freqCSset, PropStat *propStat){
@@ -4476,6 +4486,7 @@ void freeTFIDFInfo(TFIDFInfo *tfidfInfos
free(tfidfInfos);
}
+#if USE_LABEL_FOR_MERGING
static
void mergeCSByS2(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap,
int curNumMergeCS, oid *mergecsId,OntoUsageNode *ontoUsageTree, oid
**ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet){
int i, j;
@@ -4517,6 +4528,7 @@ void mergeCSByS2(CSset *freqCSset, CSlab
}
}
+#endif
static
void mergeCSByS4(CSset *freqCSset, CSlabel** labels, oid* mergeCSFreqCSMap,
int curNumMergeCS, oid *mergecsId,oid **ontmetadata, int ontmetadataCount){
@@ -5330,6 +5342,39 @@ float similarityScoreWithOntologyClass(o
return ((float) sumXY);
}
+
+#if COUNT_PERCENTAGE_ONTO_PROP_USED
+/* Count the values present in both arr1 (m entries) and arr2 (n entries); the count is written to *numOverlap. */
+static
+void countNumOverlapProp(oid* arr1, oid* arr2, int m, int n,
+ int *numOverlap){
+/* NOTE(review): the two-index walk below presumably expects both arrays sorted ascending -- confirm with callers. */
+ int i = 0, j = 0; /* i indexes arr2 (bounded by n), j indexes arr1 (bounded by m) */
+ int numCommon = 0; /* number of equal pairs found so far */
+
+ i = 0;
+ j = 0;
+ while( i < n && j < m ) /* stop as soon as either array is exhausted */
+ {
+ if( arr1[j] < arr2[i] ){
+ j++; /* arr1 holds the smaller value: advance in arr1 only */
+
+ }
+ else if( arr1[j] == arr2[i] )
+ {
+ j++; /* equal values: step past the pair in both arrays */
+ i++;
+ numCommon++;
+
+ }
+ else if( arr1[j] > arr2[i] ) /* only remaining case: arr2 holds the smaller value */
+ i++;
+ }
+
+ *numOverlap = numCommon; /* the result is delivered only through the out-parameter */
+
+}
+#endif /* COUNT_PERCENTAGE_ONTO_PROP_USED */
static
void getBestRdfTypeValue(oid *buff, int numP, oid *rdftypeOntologyValues, char
*rdftypeSelectedValues, char *rdftypeSpecificLevels, BUN *rdftypeOntClassPos,
int *numTypeValues, int maxSpecificLevel, TFIDFInfo *tfidfInfos){
@@ -8478,6 +8523,13 @@ str printFinalStructure(CStableStat* cst
char* schema = "rdf";
BATiter mapi;
BAT *mbat = NULL;
+ #if COUNT_PERCENTAGE_ONTO_PROP_USED
+ int numOntologyName = 0;
+ int numOntologyProp = 0;
+ int numOntologyPropUsed = 0;
+ int tmpNumOverlap = 0;
+ BUN tmpPos = BUN_NONE;
+ #endif
printf("Summarizing the final table information \n");
// allocate memory space for cstablestat
@@ -8499,14 +8551,31 @@ str printFinalStructure(CStableStat* cst
for (i = 0; i < numTables; i++){
tmpNumDefaultCol = csPropTypes[i].numProp -
csPropTypes[i].numInfreqProp;
+ assert(tmpNumDefaultCol == cstablestat->lstcstable[i].numCol);
if (cstablestat->lstcstable[i].tblname != BUN_NONE){
str subjStrShort = NULL;
//takeOid(cstablestat->lstcstable[i].tblname, &subjStr);
getStringName(cstablestat->lstcstable[i].tblname,
&subjStr, mapi, mbat, 1);
getPropNameShort(&subjStrShort, subjStr);
-
+ #if COUNT_PERCENTAGE_ONTO_PROP_USED
+ tmpNumOverlap = 0;
+ tmpPos = BUNfnd(BATmirror(ontmetaBat),
&cstablestat->lstcstable[i].tblname);
+ if (tmpPos != BUN_NONE){
+ if (ontclassSet[tmpPos].numProp != 0){
//otherwise, we do not have the information for this ontology class
+
countNumOverlapProp(ontclassSet[tmpPos].lstProp,
cstablestat->lstcstable[i].lstProp ,
+
ontclassSet[tmpPos].numProp,tmpNumDefaultCol, &tmpNumOverlap);
+
+ numOntologyPropUsed += tmpNumOverlap;
+ numOntologyName += 1;
+ numOntologyProp +=
ontclassSet[tmpPos].numProp;
+ }
+
+ }
+ fprintf(fout, "Table %d (Name: %s | NumCols: %d | (Num
onto prop. used: %d)\n", i, subjStrShort, tmpNumDefaultCol,tmpNumOverlap);
+ #else
fprintf(fout, "Table %d (Name: %s | NumCols: %d)\n", i,
subjStrShort, tmpNumDefaultCol);
+ #endif
GDKfree(subjStrShort);
GDKfree(subjStr);
@@ -8541,6 +8610,14 @@ str printFinalStructure(CStableStat* cst
}
printf(" Number of no-name table: %d | (Total:
%d)\n",numNoNameTable,numTables);
+ #if COUNT_PERCENTAGE_ONTO_PROP_USED
+ if (numOntologyProp != 0){
+ printf(" Percentage of ontology prop. used is %f (in %d
ontology names used) \n", (float)
numOntologyPropUsed/numOntologyProp,numOntologyName);
+ }
+ else{
+ printf(" Percentage of ontology prop. used: There is no
ontology attributes for this dataset \n");
+ }
+ #endif
fclose(fout);
@@ -9090,6 +9167,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
curNumMergeCS = countNumberMergeCS(freqCSset);
printf("Before using rules: Number of freqCS is: %d \n",curNumMergeCS);
+#if USE_LABEL_FOR_MERGING
/* ---------- S1 ------- */
mergecsId = *maxCSoid + 1;
@@ -9109,6 +9187,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
computeMetricsQ(freqCSset);
#endif
tmpLastT = curT;
+#endif
/* ---------- S3 ------- */
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
@@ -9159,6 +9238,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
tmpLastT = curT;
+#if USE_LABEL_FOR_MERGING
//S2: Common ancestor
free(mergeCSFreqCSMap);
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
@@ -9179,6 +9259,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
#endif
tmpLastT = curT;
+#endif
//S4: TF/IDF similarity
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -152,7 +152,7 @@ typedef struct PropStat {
#define STORE_PERFORMANCE_METRIC_INFO 1
-#define NO_OUTPUTFILE 1 /*Do not write the output to any file */
+#define NO_OUTPUTFILE 0 /*Do not write the output to any file */
extern int totalNumberOfTriples;
extern int acceptableTableSize;
@@ -272,6 +272,8 @@ typedef struct SubCSSet{
#define REMOVE_LOTSOFNULL_SUBJECT 1
#define LOTSOFNULL_SUBJECT_THRESHOLD 0.1
+#define COUNT_PERCENTAGE_ONTO_PROP_USED 1 //Calculate the
percentage of properties of ontology class
+ //used in final schema
#define DETECT_INCORRECT_TYPE_SUBJECT 0 //Detect subjects that are
assigned wrong type. (Default value 0)
#define USING_FINALTABLE 0 //Using the final table for
collecting label stat or using set of
//final merged CS. The set of
merged CS will be larged as it may
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list