Changeset: c8f249486552 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=c8f249486552
Modified Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Change the priority in assigning label to type-ontology-FK
Slightly change the TF/IDF computation
Disable S3 (Sub-super) and S5 (FK).
diffs (truncated from 444 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1472,6 +1472,7 @@ void createOntologyLookupResult(oid** re
// get class names
resultCount[i] = 0;
+
result[i] = getOntologyCandidates(ontattributes,
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]),
propOntologiesOids, propOntologiesCount, ontologyCount, propStat);
for (j = 0; j < ontologyCount; ++j) {
@@ -2082,90 +2083,6 @@ void getTableName(CSlabel* label, int cs
(void) ontmetaBat;
- // --- ONTOLOGY ---
- // add all ontology candidates to list of candidates
- if (resultCount[csIdx] >= 1) {
- label->candidatesOntology = resultCount[csIdx];
- label->candidates = GDKrealloc(label->candidates, sizeof(oid) *
(label->candidatesCount + resultCount[csIdx]));
- if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
- for (i = 0; i < resultCount[csIdx]; ++i) {
- label->candidates[label->candidatesCount + i] =
result[csIdx][i];
- }
- label->candidatesCount += resultCount[csIdx];
- }
-
- // one ontology class --> use it
- if (resultCount[csIdx] == 1) {
- label->name = result[csIdx][0];
- label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
- nameFound = 1;
- #if INFO_WHERE_NAME_FROM
- label->isOntology = 1;
- #endif
- }
-
- if (!nameFound) {
- // multiple ontology classes --> intersect with types
- if (resultCount[csIdx] > 1) {
- tmpList = NULL;
- tmpListCount = 0;
- // search for type values
- for (i = 0; i < typeAttributesCount; ++i) {
- for (j = 0; j <
typeAttributesHistogramCount[csIdx][i]; ++j) {
- if
(typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; //
sorted
-
- // intersect type with ontology classes
- for (k = 0; k < resultCount[csIdx];
++k) {
- if (result[csIdx][k] ==
typeAttributesHistogram[csIdx][i][j].value) {
- // found, copy ontology
class to tmpList
- tmpList = (oid *)
realloc(tmpList, sizeof(oid) * (tmpListCount + 1));
- if (!tmpList)
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
- tmpList[tmpListCount] =
result[csIdx][k];
- tmpListCount += 1;
- }
- }
- }
- }
-
- // only one left --> use it
- if (tmpListCount == 1) {
- label->name = tmpList[0];
- label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
- free(tmpList);
- nameFound = 1;
- #if INFO_WHERE_NAME_FROM
- label->isOntology = 1;
- #endif
- }
-
- if (!nameFound) {
- // multiple left --> use the class that covers
most attributes, most popular ontology, ...
- if (tmpListCount > 1) {
- label->name = tmpList[0]; // sorted
- label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
- free(tmpList);
- nameFound = 1;
-
- #if INFO_WHERE_NAME_FROM
- label->isOntology = 1;
- #endif
- }
- }
-
- if (!nameFound) {
- // empty intersection -> use the class that
covers most attributes, most popular ontology, ..
- label->name = result[csIdx][0]; // sorted
- label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
- free(tmpList);
- nameFound = 1;
-
- #if INFO_WHERE_NAME_FROM
- label->isOntology = 1;
- #endif
- }
- }
- }
-
// --- TYPE ---
// get most frequent type value per type attribute
tmpList = NULL;
@@ -2204,7 +2121,7 @@ void getTableName(CSlabel* label, int cs
int freq;
if (typeAttributesHistogram[csIdx][i][j].percent <
TYPE_FREQ_THRESHOLD) break;
depth = ontclassSet[BUNfnd(BATmirror(ontmetaBat),
&typeAttributesHistogram[csIdx][i][j].value)].hierDepth;
- maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat),
&maxDepthOid)].hierDepth;;
+ maxDepth = ontclassSet[BUNfnd(BATmirror(ontmetaBat),
&maxDepthOid)].hierDepth;
freq = typeAttributesHistogram[csIdx][i][j].freq;
if (depth > maxDepth) {
// choose value with higher hierarchy level
@@ -2269,6 +2186,95 @@ void getTableName(CSlabel* label, int cs
}
}
+
+ // --- ONTOLOGY ---
+ // add all ontology candidates to list of candidates
+ if (resultCount[csIdx] >= 1) {
+ label->candidatesOntology = resultCount[csIdx];
+ label->candidates = GDKrealloc(label->candidates, sizeof(oid) *
(label->candidatesCount + resultCount[csIdx]));
+ if (!label->candidates) fprintf(stderr, "ERROR: Couldn't
realloc memory!\n");
+ for (i = 0; i < resultCount[csIdx]; ++i) {
+ label->candidates[label->candidatesCount + i] =
result[csIdx][i];
+ }
+ label->candidatesCount += resultCount[csIdx];
+ }
+
+ // one ontology class --> use it
+ if (!nameFound){
+ if (resultCount[csIdx] == 1) {
+ label->name = result[csIdx][0];
+ label->hierarchy = getOntoHierarchy(label->name,
&(label->hierarchyCount), ontmetadata, ontmetadataCount);
+ nameFound = 1;
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
+ }
+ }
+
+ if (!nameFound) {
+ // multiple ontology classes --> intersect with types
+ if (resultCount[csIdx] > 1) {
+ tmpList = NULL;
+ tmpListCount = 0;
+ // search for type values
+ for (i = 0; i < typeAttributesCount; ++i) {
+ for (j = 0; j <
typeAttributesHistogramCount[csIdx][i]; ++j) {
+ if
(typeAttributesHistogram[csIdx][i][j].percent < TYPE_FREQ_THRESHOLD) break; //
sorted
+
+ // intersect type with ontology classes
+ for (k = 0; k < resultCount[csIdx];
++k) {
+ if (result[csIdx][k] ==
typeAttributesHistogram[csIdx][i][j].value) {
+ // found, copy ontology
class to tmpList
+ tmpList = (oid *)
realloc(tmpList, sizeof(oid) * (tmpListCount + 1));
+ if (!tmpList)
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+ tmpList[tmpListCount] =
result[csIdx][k];
+ tmpListCount += 1;
+ }
+ }
+ }
+ }
+
+ // only one left --> use it
+ if (tmpListCount == 1) {
+ label->name = tmpList[0];
+ label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
+ free(tmpList);
+ nameFound = 1;
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
+ }
+
+ if (!nameFound) {
+ // multiple left --> use the class that covers
most attributes, most popular ontology, ...
+ if (tmpListCount > 1) {
+ label->name = tmpList[0]; // sorted
+ label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
+ free(tmpList);
+ nameFound = 1;
+
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
+ }
+ }
+
+ if (!nameFound) {
+ // empty intersection -> use the class that
covers most attributes, most popular ontology, ..
+ label->name = result[csIdx][0]; // sorted
+ label->hierarchy =
getOntoHierarchy(label->name, &(label->hierarchyCount), ontmetadata,
ontmetadataCount);
+ free(tmpList);
+ nameFound = 1;
+
+ #if INFO_WHERE_NAME_FROM
+ label->isOntology = 1;
+ #endif
+ }
+ }
+ }
+
+
+
// --- FK ---
// add top3 fk values to list of candidates
if (links[csIdx].num > 0) {
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -92,7 +92,8 @@ enum {
#define FK_FREQ_THRESHOLD 25 // X % of the targeted subjects have to
be in this table
#define TYPE_FREQ_THRESHOLD 80 // X % of the type values have to be
this value
-#define ONTOLOGY_FREQ_THRESHOLD 0.4 // similarity threshold for tfidf
simularity for ontology classes
+//#define ONTOLOGY_FREQ_THRESHOLD 0.4 // similarity threshold for tfidf
simularity for ontology classes
+#define ONTOLOGY_FREQ_THRESHOLD 0.8 // similarity threshold for tfidf
simularity for ontology classes
#define USE_SHORT_NAMES 1 // use getPropNameShort()
#define USE_TYPE_NAMES 1 // use type attribute values for
labeling
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -121,6 +121,26 @@ static void initcsIdFreqIdxMap(int* inpu
+str printTKNZStringFromOid(oid id){
+ int ret;
+ char* schema = "rdf";
+ str propStr;
+
+ if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+ throw(RDF, "rdf.rdfschema",
+ "could not open the tokenizer\n");
+ }
+
+ takeOid(id, &propStr);
+ printf("String for "BUNFMT": %s\n", id, propStr);
+
+ GDKfree(propStr);
+ TKNZRclose(&ret);
+
+ return MAL_SUCCEED;
+}
+
+
char isCSTable(CS item){
if (item.parentFreqIdx != -1) return 0;
@@ -2507,6 +2527,11 @@ oid putaCStoHash(CSBats *csBats, oid* ke
csId = *csoid;
addNewCS(csBats, fullPropStat, &csKey, key, csoid, num,
numTriples, numTypeValues, rdftypeOntologyValues);
+ //if (csId == 73){
+ // printf("Extra info for cs 73 is: ");
+ // printTKNZStringFromOid(rdftypeOntologyValues[0]);
+ //}
+
//Handle the case when freqThreshold == 1
if (isStoreFreqCS ==1 && freqThreshold == 1){
#if STOREFULLCS
@@ -2612,7 +2637,10 @@ static int isSubset(oid* arr1, oid* arr2
* See http://disi.unitn.it/~bernardi/Courses/DL/Slides_11_12/measures.pdf
* tf(t,d): Number of times t occurs in d. --> For a CS, tf(prop, aCS) = 1;
* idf(t): The rarity of a term t in the whold document collection
- * idf(t) = log(#totalNumOfCSs / #numberCSs_containing_t +1)
+ * idf(t) = log(#totalNumOfCSs / #numberCSs_containing_t)
+ * Note that some implementations use #numberCSs_containing_t + 1 to avoid a
+ * division by 0 when the term does not appear in any document. However, in
+ * our case, every prop must appear in at least one CS.
* tf-idf(t,d,D) = tf(t,d) * idf(t,D)
*
* Note that: If we use normalize tf by dividing with maximum tf
@@ -2621,7 +2649,7 @@ static int isSubset(oid* arr1, oid* arr2
static
float tfidfComp(int numContainedCSs, int totalNumCSs){
- return log((float)totalNumCSs/(1+numContainedCSs));
+ return log((float)totalNumCSs/(numContainedCSs));
}
/*
@@ -3014,15 +3042,33 @@ void getPropStatisticsFromMergeCSs(PropS
for (i = 0; i < propStat->numAdded; i++){
propStat->tfidfs[i] =
tfidfComp(propStat->freqs[i],curNumMergeCS);
+
}
//BATprint(propStat->pBat);
/*
+ {
+ int ret;
+ char* schema = "rdf";
+ str propStr;
+
+ if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+ printf("Fail in opening Tokenizer \n");
+ }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list