Changeset: cc4754ecf9c8 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cc4754ecf9c8
Added Files:
	monetdb5/extras/rdf/ontmetadata/ontAttribute.bsbm.csv
	monetdb5/extras/rdf/ontmetadata/ontMetadata.bsbm.csv
Modified Files:
	monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh
	monetdb5/extras/rdf/rdflabels.c
	monetdb5/extras/rdf/rdflabels.h
	monetdb5/extras/rdf/rdfschema.c
	monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Modify to get the exact schema for BSBM - Add ontology for bsbm - If merge 2 dimension CSs, return a dimension CS. diffs (truncated from 326 to 300 lines): diff --git a/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh b/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh --- a/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh +++ b/monetdb5/extras/rdf/ontmetadata/loadOntologyToMonetDB.sh @@ -77,6 +77,19 @@ sed -i "s:AttFile:${PWD}/ontAttribute.fo mclient < loadtmp.sql +#bsbm +NUMMETADATA=`cat ontMetadata.bsbm.csv | wc -l` +NUMATTRIBUTES=`cat ontAttribute.bsbm.csv | wc -l` + +cp loadOntologySAMPLE.sql loadtmp.sql +sed -i "s:NUMMETADATA:$NUMMETADATA:g" loadtmp.sql +sed -i "s:NUMATTRIBUTES:$NUMATTRIBUTES:g" loadtmp.sql +sed -i "s:MetaFile:${PWD}/ontMetadata.bsbm.csv:g" loadtmp.sql +sed -i "s:AttFile:${PWD}/ontAttribute.bsbm.csv:g" loadtmp.sql + + +mclient < loadtmp.sql + #List of possible ontologies NUMONT=`cat ontList.csv | wc -l` diff --git a/monetdb5/extras/rdf/ontmetadata/ontAttribute.bsbm.csv b/monetdb5/extras/rdf/ontmetadata/ontAttribute.bsbm.csv new file mode 100644 --- /dev/null +++ b/monetdb5/extras/rdf/ontmetadata/ontAttribute.bsbm.csv @@ -0,0 +1,10 @@ +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/ProductFeature|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/ProductType|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Producer|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Product|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/ProductTypeProduct|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/ProductFeatureProduct|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Vendor|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Offer|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Person|NULL +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Review|NULL diff --git 
a/monetdb5/extras/rdf/ontmetadata/ontMetadata.bsbm.csv b/monetdb5/extras/rdf/ontmetadata/ontMetadata.bsbm.csv new file mode 100644 --- /dev/null +++ b/monetdb5/extras/rdf/ontmetadata/ontMetadata.bsbm.csv @@ -0,0 +1,10 @@ +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/ProductFeature|ProductFeature|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/ProductType|ProductType|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Producer|Producer|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Product|Product|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/ProductTypeProduct|ProductTypeProduct|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/ProductFeatureProduct|ProductFeatureProduct|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Vendor|Vendor|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Offer|Offer|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Person|Person|http://www.w3.org/2002/07/owl#Thing +http://www4.wiwiss.fu-berlin.de/bizer/bsbm/v01/vocabulary/Review|Review|http://www.w3.org/2002/07/owl#Thing diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c --- a/monetdb5/extras/rdf/rdflabels.c +++ b/monetdb5/extras/rdf/rdflabels.c @@ -856,11 +856,13 @@ static void insertValuesIntoTypeAttributesHistogram(oid* typeList, int typeListLength, TypeAttributesFreq*** typeAttributesHistogram, int** typeAttributesHistogramCount, int csFreqIdx, int type, BAT *ontmetaBat) { int i, j; int fit; + (void) ontmetaBat; for (i = 0; i < typeListLength; ++i) { + #if ONLY_USE_ONTOLOGYBASED_TYPE BUN pos = BUNfnd(BATmirror(ontmetaBat), &typeList[i]); if (pos == BUN_NONE) continue; // no ontology 
information, ignore - + #endif // add to histogram fit = 0; for (j = 0; j < typeAttributesHistogramCount[csFreqIdx][type]; ++j) { @@ -2079,7 +2081,11 @@ void getTableName(CSlabel* label, int cs oid maxDepthOid; int maxFreq; - + //for choosing the right type values + BUN ontClassPos; + oid typeOid; + int depth, maxDepth; + int freq; (void) ontmetaBat; @@ -2087,6 +2093,7 @@ void getTableName(CSlabel* label, int cs // get most frequent type value per type attribute tmpList = NULL; tmpListCount = 0; + for (i = 0; i < typeAttributesCount; ++i) { if (typeAttributesHistogramCount[csIdx][i] == 0) continue; /* //TODO: Uncomment this path @@ -2109,6 +2116,7 @@ void getTableName(CSlabel* label, int cs } } */ + if (typeAttributesHistogram[csIdx][i][0].percent < TYPE_FREQ_THRESHOLD) continue; // sorted tmpList = (oid *) realloc(tmpList, sizeof(oid) * (tmpListCount + 1)); if (!tmpList) fprintf(stderr, "ERROR: Couldn't realloc memory!\n"); @@ -2116,23 +2124,40 @@ void getTableName(CSlabel* label, int cs // of all values that are >= TYPE_FREQ_THRESHOLD, choose the value with the highest hierarchy level ("deepest" value) maxDepthOid = typeAttributesHistogram[csIdx][i][0].value; maxFreq = typeAttributesHistogram[csIdx][i][0].freq; + ontClassPos = BUNfnd(BATmirror(ontmetaBat), &maxDepthOid); + if ( ontClassPos != BUN_NONE){ + maxDepth = ontclassSet[ontClassPos].hierDepth; + } + else{ + maxDepth = -1; + } + + for (j = 1; j < typeAttributesHistogramCount[csIdx][i]; ++j) { - int depth, maxDepth; - int freq; + if (typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; - depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &typeAttributesHistogram[csIdx][i][j].value)].hierDepth; - maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat), &maxDepthOid)].hierDepth; - freq = typeAttributesHistogram[csIdx][i][j].freq; - if (depth > maxDepth) { - // choose value with higher hierarchy level - maxDepthOid = typeAttributesHistogram[csIdx][i][j].value; - maxFreq = freq; - } else if 
(depth == maxDepth && freq > maxFreq) { - // if both values are on the same level, choose the value with higher frequency - maxDepthOid = typeAttributesHistogram[csIdx][i][j].value; - maxFreq = freq; + + typeOid = typeAttributesHistogram[csIdx][i][j].value; + ontClassPos = BUNfnd(BATmirror(ontmetaBat), &typeOid); + if (ontClassPos != BUN_NONE){ + depth = ontclassSet[ontClassPos].hierDepth; + freq = typeAttributesHistogram[csIdx][i][j].freq; + + if (depth > maxDepth) { + // choose value with higher hierarchy level + maxDepthOid = typeAttributesHistogram[csIdx][i][j].value; + maxFreq = freq; + maxDepth = depth; + } else if (depth == maxDepth && freq > maxFreq) { + // if both values are on the same level, choose the value with higher frequency + maxDepthOid = typeAttributesHistogram[csIdx][i][j].value; + maxFreq = freq; + } } } + + // + tmpList[tmpListCount] = maxDepthOid; tmpListCount += 1; } @@ -2185,8 +2210,7 @@ void getTableName(CSlabel* label, int cs } } } - - + // --- ONTOLOGY --- // add all ontology candidates to list of candidates if (resultCount[csIdx] >= 1) { diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h --- a/monetdb5/extras/rdf/rdflabels.h +++ b/monetdb5/extras/rdf/rdflabels.h @@ -101,6 +101,7 @@ enum { #define USE_ONTOLOGY_NAMES 1 // use ontology classes for labeling #define USE_TABLE_NAME 1 // calculate and store the final labels #define SHOW_CANDIDATES 0 // inserts a row in UML diagrams to show all candidate names +#define ONLY_USE_ONTOLOGYBASED_TYPE 0 rdf_export void getPropNameShort(char** name, char* propStr); diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c --- a/monetdb5/extras/rdf/rdfschema.c +++ b/monetdb5/extras/rdf/rdfschema.c @@ -1478,7 +1478,11 @@ CS* mergeTwoCSs(CS cs1, CS cs2, int freq int numCombineP = 0; CS *mergecs = (CS*) malloc (sizeof (CS)); - mergecs->type = (char)MERGECS; + if (cs1.type == DIMENSIONCS && cs2.type == DIMENSIONCS) + mergecs->type = DIMENSIONCS; + else + 
mergecs->type = (char)MERGECS; + mergecs->numConsistsOf = 2; mergecs->lstConsistsOf = (int*) malloc(sizeof(int) * 2); @@ -2527,7 +2531,7 @@ oid putaCStoHash(CSBats *csBats, oid* ke csId = *csoid; addNewCS(csBats, fullPropStat, &csKey, key, csoid, num, numTriples, numTypeValues, rdftypeOntologyValues); - //if (csId == 73){ + //if (csId == 2){ // printf("Extra info for cs 73 is: "); // printTKNZStringFromOid(rdftypeOntologyValues[0]); //} @@ -3471,6 +3475,10 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, (void) name; (void) ontmetadata; (void) ontmetadataCount; + #if !NOT_MERGE_DIMENSIONCS_IN_S1 + (void) cs1; + (void) cs2; + #endif labelStat = initLabelStat(); buildLabelStat(labelStat, (*labels), freqCSset, TOPK); printf("Num FreqCSadded before using S1 = %d \n", freqCSset->numCSadded); @@ -3502,7 +3510,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, freqId1 = labelStat->freqIdList[i][k]; if ((*labels)[freqId1].isOntology == 1) { cs1 = &(freqCSset->items[freqId1]); - #if NOT_MERGE_DIMENSIONCS + #if NOT_MERGE_DIMENSIONCS_IN_S1 if (cs1->type == DIMENSIONCS) continue; #endif tmpCount++; @@ -3512,7 +3520,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, for (j = k+1; j < labelStat->lstCount[i]; j++){ freqId2 = labelStat->freqIdList[i][j]; cs2 = &(freqCSset->items[freqId2]); - #if NOT_MERGE_DIMENSIONCS + #if NOT_MERGE_DIMENSIONCS_IN_S1 if (cs2->type == DIMENSIONCS) continue; #endif @@ -3533,7 +3541,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, freqId1 = labelStat->freqIdList[i][k]; if ((*labels)[freqId1].isType == 1) { cs1 = &(freqCSset->items[freqId1]); - #if NOT_MERGE_DIMENSIONCS + #if NOT_MERGE_DIMENSIONCS_IN_S1 if (cs1->type == DIMENSIONCS) continue; #endif tmpCount++; @@ -3543,7 +3551,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, for (j = k+1; j < labelStat->lstCount[i]; j++){ freqId2 = labelStat->freqIdList[i][j]; cs2 = &(freqCSset->items[freqId2]); - #if NOT_MERGE_DIMENSIONCS + #if NOT_MERGE_DIMENSIONCS_IN_S1 if (cs2->type == DIMENSIONCS) continue; #endif if 
((*labels)[freqId2].isType == 1){ @@ -3563,7 +3571,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, freqId1 = labelStat->freqIdList[i][k]; if ((*labels)[freqId1].isFK == 1) { cs1 = &(freqCSset->items[freqId1]); - #if NOT_MERGE_DIMENSIONCS + #if NOT_MERGE_DIMENSIONCS_IN_S1 if (cs1->type == DIMENSIONCS) continue; #endif tmpCount++; @@ -3573,7 +3581,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, for (j = k+1; j < labelStat->lstCount[i]; j++){ freqId2 = labelStat->freqIdList[i][j]; cs2 = &(freqCSset->items[freqId2]); - #if NOT_MERGE_DIMENSIONCS + #if NOT_MERGE_DIMENSIONCS_IN_S1 if (cs2->type == DIMENSIONCS) continue; #endif if ((*labels)[freqId2].isFK == 1){ @@ -3594,8 +3602,8 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, for (k = 0; k < labelStat->lstCount[i]; k++){ freqId1 = labelStat->freqIdList[i][k]; cs1 = &(freqCSset->items[freqId1]); - #if NOT_MERGE_DIMENSIONCS - if (0) if (cs1->type == DIMENSIONCS) continue; + #if NOT_MERGE_DIMENSIONCS_IN_S1 + if (cs1->type == DIMENSIONCS) continue; #endif tmpCount++; break; @@ -3603,8 +3611,8 @@ str mergeMaxFreqCSByS1(CSset *freqCSset, for (j = k+1; j < labelStat->lstCount[i]; j++){ freqId2 = labelStat->freqIdList[i][j]; cs2 = &(freqCSset->items[freqId2]); - #if NOT_MERGE_DIMENSIONCS - if (0) if (cs2->type == DIMENSIONCS) continue; + #if NOT_MERGE_DIMENSIONCS_IN_S1 + if (cs2->type == DIMENSIONCS) continue; #endif doMerge(freqCSset, S1, freqId1, freqId2, mergecsId, labels, ontmetadata, ontmetadataCount, *name); tmpCount++; @@ -4175,11 +4183,22 @@ void mergeCSByS4(CSset *freqCSset, CSlab if (simscore > SIM_THRESHOLD) { #endif /* - takeOid((*labels)[freqId1].name, &freqCSname1); - takeOid((*labels)[freqId2].name, &freqCSname2); - printf("Merge %d (%s) and %d (%s) with simscore = %f \n",freqId1, freqCSname1, freqId2, freqCSname2, simscore); - GDKfree(freqCSname1); - GDKfree(freqCSname2); + if ((*labels)[freqId1].name != BUN_NONE){ + takeOid((*labels)[freqId1].name, &freqCSname1); + printf("Merge %d (%s) and ",freqId1, freqCSname1); + 
GDKfree(freqCSname1); _______________________________________________ checkin-list mailing list checkin-list@monetdb.org https://www.monetdb.org/mailman/listinfo/checkin-list