Changeset: 501766ea68a7 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=501766ea68a7
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Make the rule that assigns a table name based on an FK stricter.
- Only use the FK name if the number of references to that CS is greater than
a certain percentage (FK_MIN_REFER_PERCENTAGE) of the CS's frequency.
Identify good non-ontology type values:
- A type value is good if it appears in more than, e.g., 95% of the CS's
instances (GOOD_TYPE_FREQ_THRESHOLD).
- Choose this type instead of a similar ontology class; this fixes a problem
with BSBM.
diffs (truncated from 390 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -495,7 +495,7 @@ void convertToSQL(CSset *freqCSset, Rela
for (i = 0; i < freqCSset->numCSadded; ++i) {
str labelStr, tmpStr;
- if (!isCSTable(freqCSset->items[i])) continue; // ignore
+ if (!isCSTable(freqCSset->items[i],labels[i].name)) continue;
// ignore
if (labels[i].name == BUN_NONE) {
fprintf(fout, "CREATE TABLE %s_"BUNFMT" (\nsubject
VARCHAR(10) PRIMARY KEY,\n", "DUMMY", freqCSset->items[i].csId); // TODO
underscores?
@@ -568,7 +568,7 @@ void convertToSQL(CSset *freqCSset, Rela
// add foreign key columns and add foreign keys
for (i = 0; i < freqCSset->numCSadded; ++i) {
- if (!isCSTable(freqCSset->items[i])) continue; // ignore
+ if (!isCSTable(freqCSset->items[i],labels[i].name)) continue;
// ignore
for (j = 0; j < labels[i].numProp; ++j) {
str propStr, tmpStr2;
@@ -700,7 +700,7 @@ void createSQLMetadata(CSset* freqCSset,
for (i = 0; i < freqCSset->numCSadded; ++i) {
CS cs = (CS) freqCSset->items[i];
- if (!isCSTable(cs)) continue; // ignore
+ if (!isCSTable(cs, labels[i].name)) continue; // ignore
if (csRelBetweenMergeFreqSet[i].numRef == 0) continue;
for (j = 0; j < cs.numProp; ++j) { // propNo in CS order
@@ -717,7 +717,7 @@ void createSQLMetadata(CSset* freqCSset,
int toId =
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
if (toId == -1) continue; // ignore
if (i == toId) continue; // ignore self
references
- if (!isCSTable(freqCSset->items[toId]))
continue;
+ if (!isCSTable(freqCSset->items[toId],
labels[toId].name)) continue;
if ((int) (100.0 *
csRelBetweenMergeFreqSet[i].lstCnt[k] / sum + 0.5) < FK_FREQ_THRESHOLD)
continue; // foreign key is not frequent enough
tblfrom = mfreqIdxTblIdxMapping[i];
tblto = mfreqIdxTblIdxMapping[toId];
@@ -741,7 +741,7 @@ void createSQLMetadata(CSset* freqCSset,
// print id -> table name
fout = fopen("tableIdFreq.csv", "wt");
for (i = 0; i < freqCSset->numCSadded; ++i) {
- if (!isCSTable(freqCSset->items[i])) continue; // ignore
+ if (!isCSTable(freqCSset->items[i], labels[i].name)) continue;
// ignore
if (labels[i].name == BUN_NONE) {
fprintf(fout, "%d,\"%s_"BUNFMT"\",%d\n", i, "DUMMY",
freqCSset->items[i].csId, freqCSset->items[i].support); // TODO underscores?
@@ -806,7 +806,7 @@ void printTxt(CSset* freqCSset, CSlabel*
str labelStrShort = NULL;
#endif
- if (!isCSTable(freqCSset->items[i])) continue; // ignore
+ if (!isCSTable(freqCSset->items[i], labels[i].name)) continue;
// ignore
if (labels[i].name == BUN_NONE) {
fprintf(fout, "%s (CS "BUNFMT"): ", "DUMMY",
freqCSset->items[i].csId);
@@ -1848,7 +1848,7 @@ void printUML2(CSset *freqCSset, CSlabel
// find biggest and smallest table
for (i = 0; i < freqCSset->numCSadded; ++i) {
CS cs = (CS) freqCSset->items[i];
- if (!isCSTable(cs)) continue; // ignore
+ if (!isCSTable(cs,labels[i].name)) continue; // ignore
// first values
if (smallest == -1) smallest = i;
@@ -1868,7 +1868,7 @@ void printUML2(CSset *freqCSset, CSlabel
#endif
CS cs = (CS) freqCSset->items[i];
- if (!isCSTable(cs)) continue; // ignore
+ if (!isCSTable(cs, labels[i].name)) continue; // ignore
// print header
width = (int) ((300 + 300 *
(log10(freqCSset->items[i].coverage) -
log10(freqCSset->items[smallest].coverage)) /
(log10(freqCSset->items[biggest].coverage) -
log10(freqCSset->items[smallest].coverage))) + 0.5); // width between 300 and
600 px, using logarithm
@@ -1951,7 +1951,7 @@ void printUML2(CSset *freqCSset, CSlabel
for (i = 0; i < freqCSset->numCSadded; ++i) {
CS cs = (CS) freqCSset->items[i];
- if (!isCSTable(cs)) continue; // ignore
+ if (!isCSTable(cs, labels[i].name)) continue; // ignore
for (j = 0; j < cs.numProp; ++j) {
str tmpStr;
@@ -2123,7 +2123,7 @@ void removeDuplicatedCandidates(CSlabel
*
*/
static
-void getTableName(CSlabel* label, int csIdx, int typeAttributesCount,
TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount, TypeStat* typeStat, int typeStatCount, oid**
result,int** resultMatchedProp, int* resultCount, IncidentFKs* links, oid**
ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet) {
+void getTableName(CSlabel* label, CSset* freqCSset, int csIdx, int
typeAttributesCount, TypeAttributesFreq*** typeAttributesHistogram, int**
typeAttributesHistogramCount, TypeStat* typeStat, int typeStatCount, oid**
result,int** resultMatchedProp, int* resultCount, IncidentFKs* links, oid**
ontmetadata, int ontmetadataCount, BAT *ontmetaBat, OntClass *ontclassSet) {
int i, j;
oid *tmpList;
int tmpListCount;
@@ -2346,29 +2346,34 @@ void getTableName(CSlabel* label, int cs
// --- FK ---
// add top3 fk values to list of candidates
if (links[csIdx].num > 0) {
- label->candidatesFK = MIN(3, links[csIdx].num);
- label->candidates = GDKrealloc(label->candidates, sizeof(oid) *
(label->candidatesCount + MIN(3, links[csIdx].num)));
- if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
- for (i = 0; i < MIN(3, links[csIdx].num); ++i) {
- label->candidates[label->candidatesCount + i] =
links[csIdx].fks[0].prop;
+ //Only add the FK name, if its number of references is large
enought
+ if ((links[csIdx].fks[0].freq * 100) > (FK_MIN_REFER_PERCENTAGE
* freqCSset->items[csIdx].support)){
+ label->candidatesFK = MIN(3, links[csIdx].num);
+ label->candidates = GDKrealloc(label->candidates,
sizeof(oid) * (label->candidatesCount + MIN(3, links[csIdx].num)));
+ if (!label->candidates) fprintf(stderr, "ERROR:
Couldn't realloc memory!\n");
+ for (i = 0; i < MIN(3, links[csIdx].num); ++i) {
+ label->candidates[label->candidatesCount + i] =
links[csIdx].fks[0].prop;
+ }
+ label->candidatesCount += MIN(3, links[csIdx].num);
}
- label->candidatesCount += MIN(3, links[csIdx].num);
}
if (!nameFound) {
// incident foreign keys --> use the one with the most
occurances (num and freq)
if (links[csIdx].num > 0) {
- label->name = links[csIdx].fks[0].prop; // sorted
- nameFound = 1;
-
- #if INFO_WHERE_NAME_FROM
- label->isFK = 1;
- #endif
-
- #if INFO_NAME_FREQUENCY
- label->nameFreq = links[csIdx].fks[0].freq;
- label->ontologySimScore = 0.0;
- #endif
+ if ((links[csIdx].fks[0].freq * 100) >
(FK_MIN_REFER_PERCENTAGE * freqCSset->items[csIdx].support)){
+ label->name = links[csIdx].fks[0].prop; //
sorted
+ nameFound = 1;
+
+ #if INFO_WHERE_NAME_FROM
+ label->isFK = 1;
+ #endif
+
+ #if INFO_NAME_FREQUENCY
+ label->nameFreq = links[csIdx].fks[0].freq;
+ label->ontologySimScore = 0.0;
+ #endif
+ }
}
}
@@ -2467,7 +2472,7 @@ void getAllLabels(CSlabel* labels, CSset
CS cs = (CS) freqCSset->items[i];
// get table name
- getTableName(&labels[i], i, typeAttributesCount,
typeAttributesHistogram, typeAttributesHistogramCount, typeStat, typeStatCount,
result, resultMatchedProp, resultCount, links, ontmetadata, ontmetadataCount,
ontmetaBat, ontclassSet);
+ getTableName(&labels[i], freqCSset, i, typeAttributesCount,
typeAttributesHistogram, typeAttributesHistogramCount, typeStat, typeStatCount,
result, resultMatchedProp, resultCount, links, ontmetadata, ontmetadataCount,
ontmetaBat, ontclassSet);
// copy attribute oids (names)
labels[i].numProp = cs.numProp;
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -92,6 +92,7 @@ enum {
} RULE;
#define FK_FREQ_THRESHOLD 25 // X % of the targeted subjects have to
be in this table
+#define FK_MIN_REFER_PERCENTAGE 25 // To be consider as the name of a CS,
the FK have to point to at least FK_MIN_REFER_PERCENTAGE of all CS's instances
#define TYPE_FREQ_THRESHOLD 80 // X % of the type values have to be
this value
#define GOOD_TYPE_FREQ_THRESHOLD 95 // If a type appears really frequent in
that CS, it should be choosen
//#define ONTOLOGY_FREQ_THRESHOLD 0.4 // similarity threshold for tfidf
simularity for ontology classes
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -192,14 +192,19 @@ char getStringName(oid objOid, str *objS
}
-char isCSTable(CS item){
+char isCSTable(CS item, oid name){
if (item.parentFreqIdx != -1) return 0;
if (item.type == DIMENSIONCS) return 1;
#if REMOVE_SMALL_TABLE
if (item.coverage < MINIMUM_TABLE_SIZE) return 0;
- #endif
+
+ //More strict with table which does not have name
+ if ((name == BUN_NONE) && item.support < MINIMUM_TABLE_SIZE) return 0;
+ #endif
+
+
return 1;
}
@@ -682,14 +687,14 @@ void printSubCSInformation(SubCSSet *sub
*
* */
static
-void initCSPropTypes(CSPropTypes* csPropTypes, CSset* freqCSset, int
numMergedCS){
+void initCSPropTypes(CSPropTypes* csPropTypes, CSset* freqCSset, int
numMergedCS, CSlabel *labels){
int numFreqCS = freqCSset->numCSadded;
int i, j, k ;
int id;
id = 0;
for (i = 0; i < numFreqCS; i++){
- if ( isCSTable(freqCSset->items[i]) ){ // Only use the
maximum or merge CS
+ if ( isCSTable(freqCSset->items[i], labels[i].name)){ // Only
use the maximum or merge CS
csPropTypes[id].freqCSId = i;
csPropTypes[id].numProp = freqCSset->items[i].numProp;
csPropTypes[id].numInfreqProp = 0;
@@ -2046,12 +2051,12 @@ str printMergedFreqCSSet(CSset *freqCSse
if (cs.subject != BUN_NONE){
takeOid(cs.subject, &subStr);
if (labels[i].name == BUN_NONE) {
- fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d) | Subject: %s | FreqParentIdx %d \n",
cs.csId, i, "DUMMY", freq, subStr, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d | Coverage: %d) | Subject: %s | FreqParentIdx
%d \n", cs.csId, i, "DUMMY", freq, cs.coverage, subStr, cs.parentFreqIdx);
} else {
str labelStr;
//takeOid(labels[i].name,
&labelStr);
getStringName(labels[i].name,
&labelStr, mapi, mapbat, 1);
- fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d) (NameFreq: %d --> %.2f percent) | Subject: %s
| FreqParentIdx %d \n", cs.csId, i, labelStr, freq, labels[i].nameFreq, (float)
labels[i].nameFreq/freq * 100, subStr, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d | Coverage: %d) (NameFreq: %d --> %.2f percent)
| Subject: %s | FreqParentIdx %d \n", cs.csId, i, labelStr, freq,
cs.coverage,labels[i].nameFreq, (float) labels[i].nameFreq/freq * 100, subStr,
cs.parentFreqIdx);
GDKfree(labelStr);
}
@@ -2059,12 +2064,12 @@ str printMergedFreqCSSet(CSset *freqCSse
}
else{
if (labels[i].name == BUN_NONE) {
- fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d) | FreqParentIdx %d \n", cs.csId, i, "DUMMY",
freq, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d | Coverage: %d) | FreqParentIdx %d \n",
cs.csId, i, "DUMMY", freq, cs.coverage,cs.parentFreqIdx);
} else {
str labelStr;
//takeOid(labels[i].name,
&labelStr);
getStringName(labels[i].name,
&labelStr, mapi, mapbat, 1);
- fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d) (NameFreq: %d --> %.2f percent) |
FreqParentIdx %d \n", cs.csId, i, labelStr, freq, labels[i].nameFreq, (float)
labels[i].nameFreq/freq * 100, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d | Coverage: %d) (NameFreq: %d --> %.2f percent)
| FreqParentIdx %d \n", cs.csId, i, labelStr, freq,
cs.coverage,labels[i].nameFreq, (float) labels[i].nameFreq/freq * 100,
cs.parentFreqIdx);
GDKfree(labelStr);
}
}
@@ -2072,12 +2077,12 @@ str printMergedFreqCSSet(CSset *freqCSse
else {
if (labels[i].name == BUN_NONE) {
- fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d) | Subject: <Not available> | FreqParentIdx %d \n",
cs.csId, i, "DUMMY", freq, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d | Coverage: %d) | Subject: <Not available> |
FreqParentIdx %d \n", cs.csId, i, "DUMMY", freq, cs.coverage,cs.parentFreqIdx);
} else {
str labelStr;
//takeOid(labels[i].name, &labelStr);
getStringName(labels[i].name,
&labelStr, mapi, mapbat, 1);
- fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d) | Subject: <Not available> | FreqParentIdx %d \n",
cs.csId, i, labelStr, freq, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d | Coverage: %d) | Subject: <Not available> |
FreqParentIdx %d \n", cs.csId, i, labelStr, freq, cs.coverage,cs.parentFreqIdx);
GDKfree(labelStr);
}
@@ -4387,7 +4392,7 @@ static void getStatisticCSsBySupports(BA
#endif
#if NO_OUTPUTFILE == 0
-static void getStatisticFinalCSs(CSset *freqCSset, BAT *sbat, int
freqThreshold, int numTables, int* mergeCSFreqCSMap, CSPropTypes* csPropTypes){
+static void getStatisticFinalCSs(CSset *freqCSset, BAT *sbat, int
freqThreshold, int numTables, int* mergeCSFreqCSMap, CSPropTypes* csPropTypes,
CSlabel *labels){
//int *csPropNum;
//int *csFreq;
@@ -4416,7 +4421,7 @@ static void getStatisticFinalCSs(CSset *
for (i = 0; i < numTables; i++){
freqId = mergeCSFreqCSMap[i];
- if (isCSTable(freqCSset->items[freqId])){ //
Check whether it is a maximumCS
+ if (isCSTable(freqCSset->items[freqId],labels[freqId].name)){
// Check whether it is a maximumCS
// Output the result
fprintf(fout, BUNFMT " %d %d %d\n",
freqCSset->items[freqId].csId,
freqCSset->items[freqId].numProp,freqCSset->items[freqId].support,
freqCSset->items[freqId].coverage);
if (freqCSset->items[freqId].coverage > maxNumtriple)
maxNumtriple = freqCSset->items[freqId].coverage;
@@ -4444,7 +4449,7 @@ static void getStatisticFinalCSs(CSset *
}
for (i = 0; i < numTables; i++){
freqId = mergeCSFreqCSMap[i];
- if (isCSTable(freqCSset->items[freqId])){ //
Check whether it is a maximumCS
+ if (isCSTable(freqCSset->items[freqId], labels[freqId].name)){
// Check whether it is a maximumCS
// Output the result
tmpNumProp = freqCSset->items[freqId].numProp;
for (k = 1; k < 10; k++) {
@@ -4490,7 +4495,7 @@ static void getStatisticFinalCSs(CSset *
for (i = 0; i < numTables; i++){
freqId = mergeCSFreqCSMap[i];
- if (isCSTable(freqCSset->items[freqId])){ //
Check whether it is a maximumCS
+ if (isCSTable(freqCSset->items[freqId],labels[freqId].name)){
// Check whether it is a maximumCS
// Output the result
if (freqCSset->items[freqId].coverage > maxNumtriple)
maxNumtriple = freqCSset->items[freqId].coverage;
if (freqCSset->items[freqId].coverage < minNumtriple)
minNumtriple = freqCSset->items[freqId].coverage;
@@ -7299,7 +7304,7 @@ CSrel* generateCsRelBetweenMergeFreqSet(
/* Refine the relationship between mergeCS in order to create FK relationship
between tables */
static
-CSrel* getFKBetweenTableSet(CSrel *csrelFreqSet, CSset *freqCSset,
CSPropTypes* csPropTypes, int* mfreqIdxTblIdxMapping, int numTables){
+CSrel* getFKBetweenTableSet(CSrel *csrelFreqSet, CSset *freqCSset,
CSPropTypes* csPropTypes, int* mfreqIdxTblIdxMapping, int numTables, CSlabel
*labels){
int i,j;
int from, to;
int toFreqId;
@@ -7315,7 +7320,7 @@ CSrel* getFKBetweenTableSet(CSrel *csrel
for (i = 0; i < numRel; ++i) {
if (csrelFreqSet[i].numRef == 0) continue; // ignore CS without
relations
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list