Changeset: af6b114f1b3a for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=af6b114f1b3a
Modified Files:
monetdb5/extras/rdf/rdf.h
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Tune S1 by using alternative name for a freqCS, checking where the name comes
from. + validate TF-IDF function in rdflabels.c
diffs (truncated from 501 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdf.h b/monetdb5/extras/rdf/rdf.h
--- a/monetdb5/extras/rdf/rdf.h
+++ b/monetdb5/extras/rdf/rdf.h
@@ -103,6 +103,11 @@ typedef enum {
#define N_GRAPH_BAT (MAP_LEX+1)
+#define INFO_WHERE_NAME_FROM 1
+#define TOP_GENERAL_NAME 2 //Level of hierarchy in which a name is
considered to be a general name
+ //For example, PERSON, THING is at level 1
+#define USE_ALTERNATIVE_NAME 0 //Use different but may be better name
for a general name
+
// Final data structure that stores the labels for tables and attributes
typedef struct CSlabel {
oid name; // table name
@@ -116,6 +121,11 @@ typedef struct CSlabel {
int hierarchyCount; // number of entries in the hierarchy
list
int numProp; // number of properties, copied from
freqCSset->items[x].numProp
oid *lstProp; // attribute names (same order as in
freqCSset->items[x].lstProp)
+ #if INFO_WHERE_NAME_FROM
+ char isOntology; // First name is decided by ontology
+ char isType; // First name is decided based on Type
+ char isFK;
+ #endif
} CSlabel;
#endif /* _RDF_H_ */
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1167,6 +1167,7 @@ oid* getOntologyCandidates(oid** ontattr
// remove subclass if superclass is in list
for (k = 0; k < num; ++k) {
int found = 0;
+ printf(" TFIDF score at %d is: %f \n",k,
classStat[k].tfidfs);
if (classStat[k].tfidfs < ONTOLOGY_FREQ_THRESHOLD)
break; // values not frequent enough (list is sorted by tfidfs)
for (j = 0; j < ontmetadataCount && (found == 0); ++j) {
oid muri = ontmetadata[0][j];
@@ -1346,8 +1347,19 @@ void createOntologyLookupResult(oid** re
for (j = 0; j < ontologyCount; ++j) {
propOntologiesCount[j] = 0;
}
+
+ printf("Get ontology for FreqId %d. Orignal numProp = %d \n",
i, cs.numProp);
+
propOntologies = findOntologies(cs, propOntologiesCount,
&propOntologiesOids);
+ /*
+ printf("Prop ontologies count. \n");
+ for (j = 0; j < ontologyCount; ++j) {
+ if (propOntologiesCount[j] > 0)
+ printf(" (%d) props in ontology %d \n ",
propOntologiesCount[j], j);
+ }
+ */
+
// get class names
resultCount[i] = 0;
result[i] = getOntologyCandidates(ontattributes,
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]),
propOntologiesOids, propOntologiesCount, ontologyCount, propStat);
@@ -1970,6 +1982,9 @@ void getTableName(CSlabel* label, int cs
label->name = result[csIdx][0];
label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
nameFound = 1;
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
}
if (!nameFound) {
@@ -2001,6 +2016,9 @@ void getTableName(CSlabel* label, int cs
label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
free(tmpList);
nameFound = 1;
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
}
if (!nameFound) {
@@ -2010,6 +2028,10 @@ void getTableName(CSlabel* label, int cs
label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
free(tmpList);
nameFound = 1;
+
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
}
}
@@ -2019,6 +2041,10 @@ void getTableName(CSlabel* label, int cs
label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
free(tmpList);
nameFound = 1;
+
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
}
}
}
@@ -2060,6 +2086,10 @@ void getTableName(CSlabel* label, int cs
// only one type attribute, use most frequent value
(sorted)
label->name = tmpList[0];
nameFound = 1;
+ #if INFO_WHERE_NAME_FROM
+ label->isType = 1;
+ #endif
+
}
}
@@ -2071,6 +2101,10 @@ void getTableName(CSlabel* label, int cs
if (typeStat[i].value == tmpList[j]) {
label->name = tmpList[j];
nameFound = 1;
+
+ #if INFO_WHERE_NAME_FROM
+ label->isType = 1;
+ #endif
}
}
}
@@ -2094,6 +2128,10 @@ void getTableName(CSlabel* label, int cs
if (links[csIdx].num > 0) {
label->name = links[csIdx].fks[0].prop; // sorted
nameFound = 1;
+
+ #if INFO_WHERE_NAME_FROM
+ label->isFK = 1;
+ #endif
}
}
@@ -2138,6 +2176,11 @@ CSlabel* initLabels(CSset *freqCSset) {
labels[i].hierarchyCount = 0;
labels[i].numProp = 0;
labels[i].lstProp = NULL;
+ #if INFO_WHERE_NAME_FROM
+ labels[i].isOntology = 0;
+ labels[i].isType = 0;
+ labels[i].isFK = 0;
+ #endif
}
return labels;
}
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2778,7 +2778,7 @@ void generatecsRelSum(CSrel csRel, int f
for (i = 0; i < csRel.numRef; i++){
freq = freqCSset->items[csRel.origFreqIdx].support;
- if (freq < csRel.lstCnt[i] * MIN_PERCETAGE_S6){
+ if (freq > MIN_FROMTABLE_SIZE_S6 && freq < csRel.lstCnt[i] *
MIN_PERCETAGE_S6){
propIdx = 0;
while (csRelSum->lstPropId[propIdx] !=
csRel.lstPropId[i])
propIdx++;
@@ -2811,6 +2811,38 @@ LabelStat* initLabelStat(void){
return labelStat;
}
+/*
+ * Pick the name to use for candidate candIdx of labels[freqIdx].
+ *
+ * The candidate is looked up in labels[freqIdx].hierarchy. If it is found
+ * at an index <= TOP_GENERAL_NAME (i.e., it is regarded as a too general
+ * name) and a following candidate exists, that following candidate is
+ * returned. In every other case the candidate itself is returned.
+ */
+#if USE_ALTERNATIVE_NAME
+static
+oid getMostSuitableName(CSlabel *labels, int freqIdx, int candIdx){
+	oid candidate;
+	int i;
+	int hierarchyCount = labels[freqIdx].hierarchyCount;
+
+	candidate = labels[freqIdx].candidates[candIdx];
+
+	// The hierarchy is only searched when it has more than one entry.
+	// Return here so that the search index is never read without having
+	// been set by the loop below.
+	if (hierarchyCount <= 1)
+		return candidate;
+
+	for (i = 0; i < hierarchyCount; i++){
+		if (labels[freqIdx].hierarchy[i] == candidate) break;
+	}
+
+	if (i == hierarchyCount)	// Does not appear in the hierarchy
+		return candidate;
+	if (i > TOP_GENERAL_NAME)	// Not a too general candidate
+		return candidate;
+	if ((candIdx+1) < labels[freqIdx].candidatesCount)
+		return labels[freqIdx].candidates[candIdx+1];	// Use another candidate
+
+	//No choice
+	return candidate;
+}
+#endif
+
static
void buildLabelStat(LabelStat *labelStat, CSlabel *labels, CSset *freqCSset,
int k){
int i,j;
@@ -2826,7 +2858,11 @@ void buildLabelStat(LabelStat *labelStat
if (labels[i].name != BUN_NONE){
numCheck = (labels[i].candidatesCount >
k)?k:labels[i].candidatesCount;
for (j = 0; j < numCheck; j++){
+ #if USE_ALTERNATIVE_NAME
+ candidate = getMostSuitableName(labels, i, j);
+ #else
candidate = labels[i].candidates[j];
+ #endif
bun =
BUNfnd(BATmirror(labelStat->labelBat),(ptr) &candidate);
if (bun == BUN_NONE) {
/*New string*/
@@ -2874,7 +2910,11 @@ void buildLabelStat(LabelStat *labelStat
if (labels[i].name != BUN_NONE){
numCheck = (labels[i].candidatesCount >
k)?k:labels[i].candidatesCount;
for (j = 0; j < numCheck; j++){
+ #if USE_ALTERNATIVE_NAME
+ candidate = getMostSuitableName(labels, i, j);
+ #else
candidate = labels[i].candidates[j];
+ #endif
bun =
BUNfnd(BATmirror(labelStat->labelBat),(ptr) &candidate);
if (bun == BUN_NONE) {
fprintf(stderr, "All the name should be
stored already!\n");
@@ -2903,18 +2943,63 @@ void freeLabelStat(LabelStat *labelStat)
free(labelStat);
}
+static /* doMerge: put cs1 and cs2 under one common merged CS in freqCSset and refresh that CS's label through updateLabel(). */
+void doMerge(CSset *freqCSset, int ruleNum, CS* cs1, CS* cs2, int freqId1, int
freqId2, oid *mergecsId, CSlabel** labels, oid** ontmetadata, int
ontmetadataCount, oid name){
+ CS *mergecs; // result of mergeTwoCSs(); freed in the "new merge" branch after being copied into the set
+ int existMergecsId; // parentFreqIdx of the CS that already has a parent
+ CS *existmergecs, *mergecs1, *mergecs2;
+ int k;
+
+ //Check whether these CS's belong to any mergeCS (parentFreqIdx == -1 is treated as "no parent"); if both already share the same parent, no branch matches and nothing is done
+ if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx == -1){ /* New
merge */
+ mergecs = mergeTwoCSs(*cs1,*cs2, freqId1,freqId2, *mergecsId);
+ //addmergeCStoSet(mergecsSet, *mergecs);
+ cs1->parentFreqIdx = freqCSset->numCSadded; // presumably the slot addCStoSet() fills next - TODO confirm
+ cs2->parentFreqIdx = freqCSset->numCSadded;
+ addCStoSet(freqCSset,*mergecs);
+ updateLabel(ruleNum, freqCSset, labels, 1,
freqCSset->numCSadded - 1, freqId1, freqId2, name, ontmetadata,
ontmetadataCount, NULL, -1);
+ free(mergecs);
+
+ mergecsId[0]++; // advance the caller's merged-CS id counter
+ }
+ else if (cs1->parentFreqIdx == -1 && cs2->parentFreqIdx != -1){ // only cs2 has a parent: add cs1 to that parent
+ existMergecsId = cs2->parentFreqIdx;
+ existmergecs = &(freqCSset->items[existMergecsId]);
+ mergeACStoExistingmergeCS(*cs1,freqId1, existmergecs);
+ cs1->parentFreqIdx = existMergecsId;
+ updateLabel(ruleNum, freqCSset, labels, 0, existMergecsId,
freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1);
+ }
+
+ else if (cs1->parentFreqIdx != -1 && cs2->parentFreqIdx == -1){ // only cs1 has a parent: add cs2 to that parent
+ existMergecsId = cs1->parentFreqIdx;
+ existmergecs = &(freqCSset->items[existMergecsId]);
+ mergeACStoExistingmergeCS(*cs2,freqId2, existmergecs);
+ cs2->parentFreqIdx = existMergecsId;
+ updateLabel(ruleNum, freqCSset, labels, 0, existMergecsId,
freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1);
+ }
+ else if (cs1->parentFreqIdx != cs2->parentFreqIdx){ // both have different parents: combine the two parents, keeping cs1's parent index
+ mergecs1 = &(freqCSset->items[cs1->parentFreqIdx]);
+ mergecs2 = &(freqCSset->items[cs2->parentFreqIdx]);
+
+ mergeTwomergeCS(mergecs1, mergecs2, cs1->parentFreqIdx);
+
+ //Re-map for all maxCS in mergecs2: every member listed in mergecs2->lstConsistsOf now points at cs1's parent
+ for (k = 0; k < mergecs2->numConsistsOf; k++){
+
freqCSset->items[mergecs2->lstConsistsOf[k]].parentFreqIdx = cs1->parentFreqIdx;
+ }
+ updateLabel(ruleNum, freqCSset, labels, 0, cs1->parentFreqIdx,
freqId1, freqId2, name, ontmetadata, ontmetadataCount, NULL, -1);
+ }
+
+}
static
-void mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId,
oid** ontmetadata, int ontmetadataCount){
+str mergeMaxFreqCSByS1(CSset *freqCSset, CSlabel** labels, oid *mergecsId,
oid** ontmetadata, int ontmetadataCount){
int i;
#if !USE_MULTIWAY_MERGING
- int j,k;
+ int j, k;
int freqId1, freqId2;
- CS *mergecs;
- int existMergecsId;
CS *cs1, *cs2;
- CS *existmergecs, *mergecs1, *mergecs2;
#else
int *lstDistinctFreqId = NULL;
int numDistinct = 0;
@@ -2923,6 +3008,17 @@ void mergeMaxFreqCSByS1(CSset *freqCSset
#endif
LabelStat *labelStat = NULL;
oid *name;
+ #if OUTPUT_FREQID_PER_LABEL
+ FILE *fout;
+ char* schema = "rdf";
+ int ret = 0;
+ str tmpLabel;
+ int tmpCount;
+
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list