Changeset: 501766ea68a7 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=501766ea68a7
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Make the rule that assigns a table name based on an FK stricter.

- Check whether the number of references to that CS is greater than a certain
percentage of the CS's frequency.

Identify a good non-ontology type value:
- A type value qualifies if it appears in more than, e.g., 95% of the CS's instances.
- Choose this type instead of a similar ontology class --> fixes the problem with BSBM


diffs (truncated from 390 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -495,7 +495,7 @@ void convertToSQL(CSset *freqCSset, Rela
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                str labelStr, tmpStr;
 
-               if (!isCSTable(freqCSset->items[i])) continue; // ignore
+               if (!isCSTable(freqCSset->items[i],labels[i].name)) continue; 
// ignore
 
                if (labels[i].name == BUN_NONE) {
                        fprintf(fout, "CREATE TABLE %s_"BUNFMT" (\nsubject 
VARCHAR(10) PRIMARY KEY,\n", "DUMMY", freqCSset->items[i].csId); // TODO 
underscores?
@@ -568,7 +568,7 @@ void convertToSQL(CSset *freqCSset, Rela
 
        // add foreign key columns and add foreign keys
        for (i = 0; i < freqCSset->numCSadded; ++i) {
-               if (!isCSTable(freqCSset->items[i])) continue; // ignore
+               if (!isCSTable(freqCSset->items[i],labels[i].name)) continue; 
// ignore
 
                for (j = 0; j < labels[i].numProp; ++j) {
                        str propStr, tmpStr2;
@@ -700,7 +700,7 @@ void createSQLMetadata(CSset* freqCSset,
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                CS cs = (CS) freqCSset->items[i];
 
-               if (!isCSTable(cs)) continue; // ignore
+               if (!isCSTable(cs, labels[i].name)) continue; // ignore
                if (csRelBetweenMergeFreqSet[i].numRef == 0) continue; 
 
                for (j = 0; j < cs.numProp; ++j) { // propNo in CS order
@@ -717,7 +717,7 @@ void createSQLMetadata(CSset* freqCSset,
                                        int toId = 
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
                                        if (toId == -1) continue; // ignore
                                        if (i == toId) continue; // ignore self 
references
-                                       if (!isCSTable(freqCSset->items[toId])) 
continue; 
+                                       if (!isCSTable(freqCSset->items[toId], 
labels[toId].name)) continue; 
                                        if ((int) (100.0 * 
csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD) 
continue; // foreign key is not frequent enough
                                        tblfrom = mfreqIdxTblIdxMapping[i]; 
                                        tblto = mfreqIdxTblIdxMapping[toId];
@@ -741,7 +741,7 @@ void createSQLMetadata(CSset* freqCSset,
        // print id -> table name
        fout = fopen("tableIdFreq.csv", "wt");
        for (i = 0; i < freqCSset->numCSadded; ++i) {
-               if (!isCSTable(freqCSset->items[i])) continue; // ignore
+               if (!isCSTable(freqCSset->items[i], labels[i].name)) continue; 
// ignore
 
                if (labels[i].name == BUN_NONE) {
                        fprintf(fout, "%d,\"%s_"BUNFMT"\",%d\n", i, "DUMMY", 
freqCSset->items[i].csId, freqCSset->items[i].support); // TODO underscores?
@@ -806,7 +806,7 @@ void printTxt(CSset* freqCSset, CSlabel*
                str labelStrShort = NULL;
 #endif
 
-               if (!isCSTable(freqCSset->items[i])) continue; // ignore
+               if (!isCSTable(freqCSset->items[i], labels[i].name)) continue; 
// ignore
 
                if (labels[i].name == BUN_NONE) {
                        fprintf(fout, "%s (CS "BUNFMT"): ", "DUMMY", 
freqCSset->items[i].csId);
@@ -1848,7 +1848,7 @@ void printUML2(CSset *freqCSset, CSlabel
        // find biggest and smallest table
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                CS cs = (CS) freqCSset->items[i];
-               if (!isCSTable(cs)) continue; // ignore
+               if (!isCSTable(cs,labels[i].name)) continue; // ignore
 
                // first values
                if (smallest == -1) smallest = i;
@@ -1868,7 +1868,7 @@ void printUML2(CSset *freqCSset, CSlabel
 #endif
 
                CS cs = (CS) freqCSset->items[i];
-               if (!isCSTable(cs)) continue; // ignore
+               if (!isCSTable(cs, labels[i].name)) continue; // ignore
 
                // print header
                width = (int) ((300 + 300 * 
(log10(freqCSset->items[i].coverage) - 
log10(freqCSset->items[smallest].coverage)) / 
(log10(freqCSset->items[biggest].coverage) - 
log10(freqCSset->items[smallest].coverage))) + 0.5); // width between 300 and 
600 px, using logarithm
@@ -1951,7 +1951,7 @@ void printUML2(CSset *freqCSset, CSlabel
 
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                CS cs = (CS) freqCSset->items[i];
-               if (!isCSTable(cs)) continue; // ignore
+               if (!isCSTable(cs, labels[i].name)) continue; // ignore
 
                for (j = 0; j < cs.numProp; ++j) {
                        str     tmpStr;
@@ -2123,7 +2123,7 @@ void removeDuplicatedCandidates(CSlabel 
  * 
  */
 static
-void getTableName(CSlabel* label, int csIdx,  int typeAttributesCount, 
TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, TypeStat* typeStat, int typeStatCount, oid** 
result,int** resultMatchedProp, int* resultCount, IncidentFKs* links, oid** 
ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet) {
+void getTableName(CSlabel* label, CSset* freqCSset, int csIdx,  int 
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int** 
typeAttributesHistogramCount, TypeStat* typeStat, int typeStatCount, oid** 
result,int** resultMatchedProp, int* resultCount, IncidentFKs* links, oid** 
ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet) {
        int             i, j;
        oid             *tmpList;
        int             tmpListCount;
@@ -2346,29 +2346,34 @@ void getTableName(CSlabel* label, int cs
        // --- FK ---
        // add top3 fk values to list of candidates
        if (links[csIdx].num > 0) {
-               label->candidatesFK = MIN(3, links[csIdx].num);
-               label->candidates = GDKrealloc(label->candidates, sizeof(oid) * 
(label->candidatesCount + MIN(3, links[csIdx].num)));
-               if (!label->candidates) fprintf(stderr, "ERROR: Couldn't 
realloc memory!\n");
-               for (i = 0; i < MIN(3, links[csIdx].num); ++i) {
-                       label->candidates[label->candidatesCount + i] = 
links[csIdx].fks[0].prop;
+               //Only add the FK name, if its number of references is large 
enought
+               if ((links[csIdx].fks[0].freq * 100) > (FK_MIN_REFER_PERCENTAGE 
* freqCSset->items[csIdx].support)){
+                       label->candidatesFK = MIN(3, links[csIdx].num);
+                       label->candidates = GDKrealloc(label->candidates, 
sizeof(oid) * (label->candidatesCount + MIN(3, links[csIdx].num)));
+                       if (!label->candidates) fprintf(stderr, "ERROR: 
Couldn't realloc memory!\n");
+                       for (i = 0; i < MIN(3, links[csIdx].num); ++i) {
+                               label->candidates[label->candidatesCount + i] = 
links[csIdx].fks[0].prop;
+                       }
+                       label->candidatesCount += MIN(3, links[csIdx].num);
                }
-               label->candidatesCount += MIN(3, links[csIdx].num);
        }
 
        if (!nameFound) {
                // incident foreign keys --> use the one with the most 
occurances (num and freq)
                if (links[csIdx].num > 0) {
-                       label->name = links[csIdx].fks[0].prop; // sorted
-                       nameFound = 1;
-
-                       #if INFO_WHERE_NAME_FROM
-                       label->isFK = 1; 
-                       #endif
-                       
-                       #if INFO_NAME_FREQUENCY
-                       label->nameFreq = links[csIdx].fks[0].freq;
-                       label->ontologySimScore = 0.0;
-                       #endif
+                       if ((links[csIdx].fks[0].freq * 100) > 
(FK_MIN_REFER_PERCENTAGE * freqCSset->items[csIdx].support)){
+                               label->name = links[csIdx].fks[0].prop; // 
sorted
+                               nameFound = 1;
+
+                               #if INFO_WHERE_NAME_FROM
+                               label->isFK = 1; 
+                               #endif
+                               
+                               #if INFO_NAME_FREQUENCY
+                               label->nameFreq = links[csIdx].fks[0].freq;
+                               label->ontologySimScore = 0.0;
+                               #endif
+                       }
                }
        }
        
@@ -2467,7 +2472,7 @@ void getAllLabels(CSlabel* labels, CSset
                CS cs = (CS) freqCSset->items[i];
 
                // get table name
-               getTableName(&labels[i], i,  typeAttributesCount, 
typeAttributesHistogram, typeAttributesHistogramCount, typeStat, typeStatCount, 
result, resultMatchedProp, resultCount, links, ontmetadata, ontmetadataCount, 
ontmetaBat, ontclassSet);
+               getTableName(&labels[i], freqCSset, i,  typeAttributesCount, 
typeAttributesHistogram, typeAttributesHistogramCount, typeStat, typeStatCount, 
result, resultMatchedProp, resultCount, links, ontmetadata, ontmetadataCount, 
ontmetaBat, ontclassSet);
 
                // copy attribute oids (names)
                labels[i].numProp = cs.numProp;
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -92,6 +92,7 @@ enum {
 } RULE; 
 
 #define FK_FREQ_THRESHOLD 25           // X % of the targeted subjects have to 
be in this table
+#define FK_MIN_REFER_PERCENTAGE 25     // To be consider as the name of a CS, 
the FK have to point to at least FK_MIN_REFER_PERCENTAGE of all CS's instances 
 #define TYPE_FREQ_THRESHOLD 80         // X % of the type values have to be 
this value
 #define GOOD_TYPE_FREQ_THRESHOLD 95    // If a type appears really frequent in 
that CS, it should be choosen
 //#define ONTOLOGY_FREQ_THRESHOLD 0.4  // similarity threshold for tfidf 
simularity for ontology classes
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -192,14 +192,19 @@ char getStringName(oid objOid, str *objS
 }
 
 
-char isCSTable(CS item){
+char isCSTable(CS item, oid name){
        if (item.parentFreqIdx != -1) return 0; 
 
        if (item.type == DIMENSIONCS) return 1; 
 
        #if REMOVE_SMALL_TABLE
        if (item.coverage < MINIMUM_TABLE_SIZE) return 0;
-       #endif
+       
+       //More strict with table which does not have name
+       if ((name == BUN_NONE) && item.support < MINIMUM_TABLE_SIZE) return 0; 
+       #endif  
+       
+       
 
        return 1; 
 }
@@ -682,14 +687,14 @@ void printSubCSInformation(SubCSSet *sub
  * 
  * */
 static 
-void initCSPropTypes(CSPropTypes* csPropTypes, CSset* freqCSset, int 
numMergedCS){
+void initCSPropTypes(CSPropTypes* csPropTypes, CSset* freqCSset, int 
numMergedCS, CSlabel *labels){
        int numFreqCS = freqCSset->numCSadded;
        int i, j, k ;
        int id; 
        
        id = 0; 
        for (i = 0; i < numFreqCS; i++){
-               if ( isCSTable(freqCSset->items[i])  ){   // Only use the 
maximum or merge CS           
+               if ( isCSTable(freqCSset->items[i], labels[i].name)){   // Only 
use the maximum or merge CS             
                        csPropTypes[id].freqCSId = i; 
                        csPropTypes[id].numProp = freqCSset->items[i].numProp;
                        csPropTypes[id].numInfreqProp = 0; 
@@ -2046,12 +2051,12 @@ str printMergedFreqCSSet(CSset *freqCSse
                                if (cs.subject != BUN_NONE){
                                        takeOid(cs.subject, &subStr);
                                        if (labels[i].name == BUN_NONE) {
-                                               fprintf(fout,"CS " BUNFMT " - 
FreqId %d - Name: %s  (Freq: %d) | Subject: %s  | FreqParentIdx %d \n", 
cs.csId, i, "DUMMY", freq, subStr, cs.parentFreqIdx);
+                                               fprintf(fout,"CS " BUNFMT " - 
FreqId %d - Name: %s  (Freq: %d | Coverage: %d) | Subject: %s  | FreqParentIdx 
%d \n", cs.csId, i, "DUMMY", freq, cs.coverage, subStr, cs.parentFreqIdx);
                                        } else {
                                                str labelStr;
                                                //takeOid(labels[i].name, 
&labelStr);
                                                getStringName(labels[i].name, 
&labelStr, mapi, mapbat, 1);
-                                               fprintf(fout,"CS " BUNFMT " - 
FreqId %d - Name: %s  (Freq: %d) (NameFreq: %d --> %.2f percent) | Subject: %s  
| FreqParentIdx %d \n", cs.csId, i, labelStr, freq, labels[i].nameFreq, (float) 
labels[i].nameFreq/freq * 100, subStr, cs.parentFreqIdx);
+                                               fprintf(fout,"CS " BUNFMT " - 
FreqId %d - Name: %s  (Freq: %d | Coverage: %d) (NameFreq: %d --> %.2f percent) 
| Subject: %s  | FreqParentIdx %d \n", cs.csId, i, labelStr, freq, 
cs.coverage,labels[i].nameFreq, (float) labels[i].nameFreq/freq * 100, subStr, 
cs.parentFreqIdx);
                                                GDKfree(labelStr); 
                                        }
 
@@ -2059,12 +2064,12 @@ str printMergedFreqCSSet(CSset *freqCSse
                                }
                                else{
                                        if (labels[i].name == BUN_NONE) {
-                                               fprintf(fout,"CS " BUNFMT " - 
FreqId %d - Name: %s  (Freq: %d) | FreqParentIdx %d \n", cs.csId, i, "DUMMY", 
freq, cs.parentFreqIdx);
+                                               fprintf(fout,"CS " BUNFMT " - 
FreqId %d - Name: %s  (Freq: %d | Coverage: %d) | FreqParentIdx %d \n", 
cs.csId, i, "DUMMY", freq, cs.coverage,cs.parentFreqIdx);
                                        } else {
                                                str labelStr;
                                                //takeOid(labels[i].name, 
&labelStr);
                                                getStringName(labels[i].name, 
&labelStr, mapi, mapbat, 1);
-                                               fprintf(fout,"CS " BUNFMT " - 
FreqId %d - Name: %s  (Freq: %d) (NameFreq: %d --> %.2f percent) | 
FreqParentIdx %d \n", cs.csId, i, labelStr, freq, labels[i].nameFreq, (float) 
labels[i].nameFreq/freq * 100, cs.parentFreqIdx);
+                                               fprintf(fout,"CS " BUNFMT " - 
FreqId %d - Name: %s  (Freq: %d | Coverage: %d) (NameFreq: %d --> %.2f percent) 
| FreqParentIdx %d \n", cs.csId, i, labelStr, freq, 
cs.coverage,labels[i].nameFreq, (float) labels[i].nameFreq/freq * 100, 
cs.parentFreqIdx);
                                                GDKfree(labelStr);
                                        }
                                }
@@ -2072,12 +2077,12 @@ str printMergedFreqCSSet(CSset *freqCSse
                        else {
 
                                if (labels[i].name == BUN_NONE) {
-                                       fprintf(fout,"CS " BUNFMT " - FreqId %d 
- Name: %s  (Freq: %d) | Subject: <Not available>  | FreqParentIdx %d \n", 
cs.csId, i, "DUMMY", freq, cs.parentFreqIdx);
+                                       fprintf(fout,"CS " BUNFMT " - FreqId %d 
- Name: %s  (Freq: %d | Coverage: %d) | Subject: <Not available>  | 
FreqParentIdx %d \n", cs.csId, i, "DUMMY", freq, cs.coverage,cs.parentFreqIdx);
                                } else {
                                        str labelStr;
                                        //takeOid(labels[i].name, &labelStr);
                                        getStringName(labels[i].name, 
&labelStr, mapi, mapbat, 1);      
-                                       fprintf(fout,"CS " BUNFMT " - FreqId %d 
- Name: %s  (Freq: %d) | Subject: <Not available>  | FreqParentIdx %d \n", 
cs.csId, i, labelStr, freq, cs.parentFreqIdx);
+                                       fprintf(fout,"CS " BUNFMT " - FreqId %d 
- Name: %s  (Freq: %d | Coverage: %d) | Subject: <Not available>  | 
FreqParentIdx %d \n", cs.csId, i, labelStr, freq, cs.coverage,cs.parentFreqIdx);
                                        GDKfree(labelStr); 
                                }
 
@@ -4387,7 +4392,7 @@ static void getStatisticCSsBySupports(BA
 #endif
 
 #if NO_OUTPUTFILE == 0
-static void getStatisticFinalCSs(CSset *freqCSset, BAT *sbat, int 
freqThreshold, int numTables, int* mergeCSFreqCSMap, CSPropTypes* csPropTypes){
+static void getStatisticFinalCSs(CSset *freqCSset, BAT *sbat, int 
freqThreshold, int numTables, int* mergeCSFreqCSMap, CSPropTypes* csPropTypes, 
CSlabel *labels){
 
        //int   *csPropNum; 
        //int   *csFreq; 
@@ -4416,7 +4421,7 @@ static void getStatisticFinalCSs(CSset *
 
        for (i = 0; i < numTables; i++){
                freqId = mergeCSFreqCSMap[i]; 
-               if (isCSTable(freqCSset->items[freqId])){               // 
Check whether it is a maximumCS
+               if (isCSTable(freqCSset->items[freqId],labels[freqId].name)){   
        // Check whether it is a maximumCS
                        // Output the result 
                        fprintf(fout, BUNFMT " %d  %d  %d\n", 
freqCSset->items[freqId].csId, 
freqCSset->items[freqId].numProp,freqCSset->items[freqId].support, 
freqCSset->items[freqId].coverage); 
                        if (freqCSset->items[freqId].coverage > maxNumtriple) 
maxNumtriple = freqCSset->items[freqId].coverage;
@@ -4444,7 +4449,7 @@ static void getStatisticFinalCSs(CSset *
        }
        for (i = 0; i < numTables; i++){
                freqId = mergeCSFreqCSMap[i]; 
-               if (isCSTable(freqCSset->items[freqId])){               // 
Check whether it is a maximumCS
+               if (isCSTable(freqCSset->items[freqId], labels[freqId].name)){  
        // Check whether it is a maximumCS
                        // Output the result 
                        tmpNumProp = freqCSset->items[freqId].numProp;  
                        for (k = 1; k < 10; k++) {
@@ -4490,7 +4495,7 @@ static void getStatisticFinalCSs(CSset *
 
        for (i = 0; i < numTables; i++){
                freqId = mergeCSFreqCSMap[i]; 
-               if (isCSTable(freqCSset->items[freqId])){               // 
Check whether it is a maximumCS
+               if (isCSTable(freqCSset->items[freqId],labels[freqId].name)){   
        // Check whether it is a maximumCS
                        // Output the result 
                        if (freqCSset->items[freqId].coverage > maxNumtriple) 
maxNumtriple = freqCSset->items[freqId].coverage;
                        if (freqCSset->items[freqId].coverage < minNumtriple) 
minNumtriple = freqCSset->items[freqId].coverage;
@@ -7299,7 +7304,7 @@ CSrel* generateCsRelBetweenMergeFreqSet(
 /* Refine the relationship between mergeCS in order to create FK relationship 
between tables */
 
 static
-CSrel* getFKBetweenTableSet(CSrel *csrelFreqSet, CSset *freqCSset, 
CSPropTypes* csPropTypes, int* mfreqIdxTblIdxMapping, int numTables){
+CSrel* getFKBetweenTableSet(CSrel *csrelFreqSet, CSset *freqCSset, 
CSPropTypes* csPropTypes, int* mfreqIdxTblIdxMapping, int numTables, CSlabel 
*labels){
        int     i,j;
        int     from, to;
        int     toFreqId; 
@@ -7315,7 +7320,7 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
        for (i = 0; i < numRel; ++i) {
                if (csrelFreqSet[i].numRef == 0) continue; // ignore CS without 
relations
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to