Changeset: cd56cb75623a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=cd56cb75623a
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Merge CSs having the same name regardless of where the name comes from.
+ Modify the function that outputs freqCS information in order to check the
relation between the name of a mergedCS and the names of its parents.
diffs (287 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -1842,11 +1842,11 @@ int* mergeMultiCS(CSset *freqCSset, int
#if NO_OUTPUTFILE == 0
static
-str printFreqCSSet(CSset *freqCSset, BAT *freqBat, BAT *mapbat, char
isWriteTofile, int freqThreshold, CSlabel* labels){
-
- int i;
- int j;
- int *freq;
+str printMergedFreqCSSet(CSset *freqCSset, BAT *mapbat, char isWriteTofile,
int freqThreshold, CSlabel* labels, int mergingstep){
+
+ int i,j;
+ int mergeCSid, tmpParentFreqId;
+ int freq;
FILE *fout;
char filename[100];
char tmpStr[20];
@@ -1872,13 +1872,14 @@ str printFreqCSSet(CSset *freqCSset, BAT
mapi = bat_iterator(mapbat);
#endif
-
+ mergeCSid = -1;
if (isWriteTofile == 0){
for (i = 0; i < freqCSset->numCSadded; i++){
CS cs = (CS)freqCSset->items[i];
- freq = (int *) Tloc(freqBat, cs.csId);
-
- printf("CS " BUNFMT " (Freq: %d) | Parent " BUNFMT "
\n", cs.csId, *freq, freqCSset->items[cs.parentFreqIdx].csId);
+ if (cs.parentFreqIdx != -1) continue;
+ freq = cs.support;
+
+ printf("CS " BUNFMT " (Freq: %d) | Parent " BUNFMT "
\n", cs.csId, freq, freqCSset->items[cs.parentFreqIdx].csId);
for (j = 0; j < cs.numProp; j++){
printf(" P:" BUNFMT " --> \n", cs.lstProp[j]);
}
@@ -1888,7 +1889,7 @@ str printFreqCSSet(CSset *freqCSset, BAT
else{
strcpy(filename, "freqCSFullInfo");
- sprintf(tmpStr, "%d", freqThreshold);
+ sprintf(tmpStr, "%d_%d", freqThreshold,mergingstep);
strcat(filename, tmpStr);
strcat(filename, ".txt");
@@ -1896,33 +1897,66 @@ str printFreqCSSet(CSset *freqCSset, BAT
for (i = 0; i < freqCSset->numCSadded; i++){
CS cs = (CS)freqCSset->items[i];
- freq = (int *) Tloc(freqBat, cs.csId);
- if (cs.type != MAXCS) assert(*freq == cs.support);
+ if (cs.parentFreqIdx != -1) continue;
+ mergeCSid++;
+ freq = cs.support;
#if STOREFULLCS
- if (cs.subject != BUN_NONE){
- takeOid(cs.subject, &subStr);
+ if (i < freqCSset->numOrigFreqCS){
+ if (cs.subject != BUN_NONE){
+ takeOid(cs.subject, &subStr);
+
+ if (labels[i].name == BUN_NONE) {
+ fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d) | Subject: %s | FreqParentIdx %d \n",
cs.csId, i, "DUMMY", freq, subStr, cs.parentFreqIdx);
+ } else {
+ str labelStr;
+ takeOid(labels[i].name,
&labelStr);
+ fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d) | Subject: %s | FreqParentIdx %d \n",
cs.csId, i, labelStr, freq, subStr, cs.parentFreqIdx);
+ GDKfree(labelStr);
+ }
+
+ GDKfree(subStr);
+ }
+ else{
+ if (labels[i].name == BUN_NONE) {
+ fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d) | FreqParentIdx %d \n", cs.csId, i, "DUMMY",
freq, cs.parentFreqIdx);
+ } else {
+ str labelStr;
+ takeOid(labels[i].name,
&labelStr);
+ fprintf(fout,"CS " BUNFMT " -
FreqId %d - Name: %s (Freq: %d) | FreqParentIdx %d \n", cs.csId, i, labelStr,
freq, cs.parentFreqIdx);
+ GDKfree(labelStr);
+ }
+ }
+ }
+ else {
if (labels[i].name == BUN_NONE) {
- fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d) | Subject: %s | FreqParentIdx %d \n", cs.csId, i,
"DUMMY", *freq, subStr, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d) | Subject: <Not available> | FreqParentIdx %d \n",
cs.csId, i, "DUMMY", freq, cs.parentFreqIdx);
} else {
str labelStr;
takeOid(labels[i].name, &labelStr);
- fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d) | Subject: %s | FreqParentIdx %d \n", cs.csId, i,
labelStr, *freq, subStr, cs.parentFreqIdx);
+ fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d) | Subject: <Not available> | FreqParentIdx %d \n",
cs.csId, i, labelStr, freq, cs.parentFreqIdx);
GDKfree(labelStr);
}
- GDKfree(subStr);
- }
- else{
- if (labels[i].name == BUN_NONE) {
- fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d) | FreqParentIdx %d \n", cs.csId, i, "DUMMY", *freq,
cs.parentFreqIdx);
- } else {
- str labelStr;
- takeOid(labels[i].name, &labelStr);
- fprintf(fout,"CS " BUNFMT " - FreqId %d
- Name: %s (Freq: %d) | FreqParentIdx %d \n", cs.csId, i, labelStr, *freq,
cs.parentFreqIdx);
- GDKfree(labelStr);
+
+ fprintf(fout, "MergeCS %d (Number of parent: %d
| FreqId list: ) \n",mergeCSid, cs.numConsistsOf);
+ for (j = 0; j < cs.numConsistsOf; j++){
+ tmpParentFreqId = cs.lstConsistsOf[j];
+ fprintf(fout, " %d
[F:%d]",tmpParentFreqId, freqCSset->items[tmpParentFreqId].support);
+ if (labels[tmpParentFreqId].name ==
BUN_NONE) fprintf(fout, "[DUMMY] ");
+ else{
+ str labelStr = NULL;
+ str labelShortStr = NULL;
+
takeOid(labels[tmpParentFreqId].name, &labelStr);
+
getPropNameShort(&labelShortStr,labelStr);
+ fprintf(fout, "[%s]
",labelShortStr);
+ GDKfree(labelShortStr);
+ GDKfree(labelStr);
+ }
+
}
+ fprintf(fout, "\n");
}
#endif
@@ -1935,6 +1969,10 @@ str printFreqCSSet(CSset *freqCSset, BAT
#if STOREFULLCS
// Get object value
+ if (i >= freqCSset->numOrigFreqCS){
+ fprintf(fout, " <No Object value> \n");
+ continue;
+ }
if (cs.lstObj != NULL){
objOid = cs.lstObj[j];
@@ -1956,6 +1994,9 @@ str printFreqCSSet(CSset *freqCSset, BAT
GDKfree(objStr);
}
}
+ else{
+ fprintf(fout, " <No Object value> \n");
+ }
#endif
@@ -2466,10 +2507,6 @@ oid putaCStoHash(CSBats *csBats, oid* ke
csId = *csoid;
addNewCS(csBats, fullPropStat, &csKey, key, csoid, num,
numTriples, numTypeValues, rdftypeOntologyValues);
- if (csId == 309){
- printf("Extra info is "BUNFMT "\n",
rdftypeOntologyValues[0]);
- }
-
//Handle the case when freqThreshold == 1
if (isStoreFreqCS ==1 && freqThreshold == 1){
#if STOREFULLCS
@@ -3412,6 +3449,7 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
}
#else
+ #if MERGING_CONSIDER_NAMEORIGINALITY
//For ontology name
tmpCount = 0;
for (k = 0; k < labelStat->lstCount[i]; k++){
@@ -3499,12 +3537,42 @@ str mergeMaxFreqCSByS1(CSset *freqCSset,
tmpCount++;
}
}
+
+ #if OUTPUT_FREQID_PER_LABEL
+ fprintf(fout, " %d freqCS merged as having same name by
FK. MergedCS has %d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded
-1].numProp);
+ #endif
+
+ #else //MERGING_CONSIDER_NAMEORIGINALITY == 0
+
+ tmpCount = 0;
+ for (k = 0; k < labelStat->lstCount[i]; k++){
+ freqId1 = labelStat->freqIdList[i][k];
+ cs1 = &(freqCSset->items[freqId1]);
+ #if NOT_MERGE_DIMENSIONCS
+ if (cs1->type == DIMENSIONCS) continue;
+ #endif
+ tmpCount++;
+ break;
+ }
+ for (j = k+1; j < labelStat->lstCount[i]; j++){
+ freqId2 = labelStat->freqIdList[i][j];
+ cs2 = &(freqCSset->items[freqId2]);
+ #if NOT_MERGE_DIMENSIONCS
+ if (cs2->type == DIMENSIONCS) continue;
+ #endif
+ doMerge(freqCSset, S1, freqId1, freqId2,
mergecsId, labels, ontmetadata, ontmetadataCount, *name);
+ tmpCount++;
+ }
+
+ #if OUTPUT_FREQID_PER_LABEL
+ fprintf(fout, " %d freqCS merged as having same name
(by Ontology, Type, FK). MergedCS has %d prop. \n", tmpCount,
freqCSset->items[freqCSset->numCSadded -1].numProp);
+ #endif
+
+ #endif
+
#endif /* USE_MULTIWAY_MERGING */
#if OUTPUT_FREQID_PER_LABEL
-
- fprintf(fout, " %d freqCS merged as having same name by
FK. MergedCS has %d prop. \n", tmpCount, freqCSset->items[freqCSset->numCSadded
-1].numProp);
-
takeOid(*name, &tmpLabel);
#if USE_SHORT_NAMES
getPropNameShort(&canStrShort, tmpLabel);
@@ -6927,7 +6995,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
tmpLastT = curT;
#if NO_OUTPUTFILE == 0
- printFreqCSSet(freqCSset, csBats->freqBat, mbat, 1, *freqThreshold,
*labels);
+ printMergedFreqCSSet(freqCSset, mbat, 1, *freqThreshold, *labels, 0);
#endif
//return "Error";
@@ -6998,6 +7066,10 @@ RDFextractCSwithTypes(int *ret, bat *sba
printf("Merging with S1 took %f. (Number of mergeCS: %d | NumconsistOf:
%d) \n", ((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS,
countNumberConsistOfCS(freqCSset));
printf("Number of added CS after S1: %d \n", freqCSset->numCSadded);
+ #if NO_OUTPUTFILE == 0
+ printMergedFreqCSSet(freqCSset, mbat, 1, *freqThreshold, *labels, 1);
+ #endif
+
#if STORE_PERFORMANCE_METRIC_INFO
computeMetricsQ(freqCSset);
#endif
@@ -7007,6 +7079,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
initMergeCSFreqCSMap(freqCSset, mergeCSFreqCSMap);
+ if (0){
/*S4: Merge two CS's having the subset-superset relationship */
mergeCSbyS4(freqCSset, labels, mergeCSFreqCSMap,curNumMergeCS,
ontmetadata, ontmetadataCount);
@@ -7018,9 +7091,9 @@ RDFextractCSwithTypes(int *ret, bat *sba
#if STORE_PERFORMANCE_METRIC_INFO
computeMetricsQ(freqCSset);
#endif
-
+
tmpLastT = curT;
-
+ }
/* ---------- S6 ------- */
free(mergeCSFreqCSMap);
mergeCSFreqCSMap = (oid*) malloc(sizeof(oid) * curNumMergeCS);
@@ -7028,7 +7101,7 @@ RDFextractCSwithTypes(int *ret, bat *sba
/* S6: Merged CS referred from the same CS via the same property */
- if (1){
+ if (0){
tmpCSrelToMergeCS = generateCsRelToMergeFreqSet(csrelSet, freqCSset);
tmpNumRel = freqCSset->numCSadded;
@@ -7059,6 +7132,9 @@ RDFextractCSwithTypes(int *ret, bat *sba
curNumMergeCS = countNumberMergeCS(freqCSset);
curT = clock();
printf ("Merging with S3, S5 took %f. (Number of mergeCS: %d)
\n",((float)(curT - tmpLastT))/CLOCKS_PER_SEC, curNumMergeCS);
+ #if NO_OUTPUTFILE == 0
+ printMergedFreqCSSet(freqCSset, mbat, 1, *freqThreshold, *labels, 5);
+ #endif
#if STORE_PERFORMANCE_METRIC_INFO
computeMetricsQ(freqCSset);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -122,6 +122,7 @@ typedef struct PropStat {
the freqIdx of merged CS from previous
rule. */
#define OUTPUT_FREQID_PER_LABEL 1 /* This is for evaluating the results
of merging using S1. TODO: Set it to 0 for default*/
+#define MERGING_CONSIDER_NAMEORIGINALITY 0 /*Merging in rule S1,
considering where the name comes from (e.g., from Ontology, from rdf:type, or
from FK) */
#define IS_MULVALUE_THRESHOLD 1.1 /* The ratio betweeen (the number of
triple coverred by Prop P) / (number of Non-NULL object values for P)
If this ratio is ~1, only use single
value column for that prop
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list