Changeset: 0d988e82a42b for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=0d988e82a42b
Modified Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Correct the tf-idf computation in rdflabels.c

This needs to be verified by Linnea.


diffs (266 lines):

diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
--- a/monetdb5/extras/rdf/rdflabels.c
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -1074,7 +1074,7 @@ int compareOntologyCandidates (const voi
 #if USE_ONTOLOGY_NAMES
 /* For one CS: Calculate the ontology classes that are similar (tfidf) to the 
list of attributes. */
 static
-oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** 
ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int 
*listCount, int listNum, PropStat *propStat) {
+oid* getOntologyCandidates(oid** ontattributes, int ontattributesCount, oid** 
ontmetadata, int ontmetadataCount, int *resultCount, oid **listOids, int 
*listCount, int listNum, PropStat *propStat, float *totaltfidfsPerOntology) {
        int             i, j, k, l;
        oid             *result = NULL;
 
@@ -1095,7 +1095,7 @@ oid* getOntologyCandidates(oid** ontattr
                        candidates[j] = NULL;
                        candidatesCount[j] = 0;
                }
-
+               //printf("Number of attribute in corresponding ontology is: %d 
\n", ontattributesCount);
                for (j = 0; j < ontattributesCount; ++j) {
                        oid auri = ontattributes[0][j];
                        oid aattr = ontattributes[1][j];
@@ -1143,11 +1143,27 @@ oid* getOntologyCandidates(oid** ontattr
                                        classStat = (ClassStat *) 
realloc(classStat, sizeof(ClassStat) * (num + 1));
                                        if (!classStat) fprintf(stderr, "ERROR: 
Couldn't realloc memory!\n");
                                        classStat[num].ontoClass = 
candidates[j][k]; // pointer, no copy
+                                       classStat[num].totaltfidfs = 0.0;
                                        classStat[num].tfidfs = 
(propStat->tfidfs[bun] * propStat->tfidfs[bun]);
                                        num += 1;
                                }
                        }
                }
+               
+               //[DUC --- add the total tfidf score for a ontology class]  
//TODO: Compute before, not here
+               for (l = 0; l < num; ++l){
+                       for (j = 0; j < ontmetadataCount; ++j) {
+                               oid auri = ontmetadata[0][j];
+                               //printf("auri = " BUNFMT "\n", auri);
+                               if (auri == classStat[l].ontoClass){
+                                       //printf("Classstat %d (uri: "BUNFMT ") 
- Set totaltfidf with ontology %dth: %f \n", l, auri, j, 
totaltfidfsPerOntology[j]); 
+                                       classStat[l].totaltfidfs = 
totaltfidfsPerOntology[j]; 
+                                       break; 
+                               }
+                       }
+               }
+               //[ ... DUC]
+
 
                // calculate optimal tfidf score (all properties) & normalize 
tfidf sums
                totalTfidfs = 0.0;
@@ -1158,7 +1174,11 @@ oid* getOntologyCandidates(oid** ontattr
                        totalTfidfs += (propStat->tfidfs[bun] * 
propStat->tfidfs[bun]);
                }
                for (j = 0; j < num; ++j) {
-                       classStat[j].tfidfs /= totalTfidfs;
+                       //classStat[j].tfidfs /= totalTfidfs;  //[DUC--modify]
+                       //printf("original classStat[j].tfidfs = %f \n", 
classStat[j].tfidfs);
+                       classStat[j].tfidfs = classStat[j].tfidfs / 
(sqrt(totalTfidfs)*sqrt(classStat[j].totaltfidfs));
+                       //printf("totalTfidfs = %f    || 
classStat[j].totaltfidfs =  %f || classStat[j].tfidfs = %f 
\n",totalTfidfs,classStat[j].totaltfidfs,classStat[j].tfidfs);
+                       
                }
 
                // sort by tfidf desc
@@ -1167,7 +1187,7 @@ oid* getOntologyCandidates(oid** ontattr
                // remove subclass if superclass is in list
                for (k = 0; k < num; ++k) {
                        int found = 0;
-                       printf("    TFIDF score at %d is: %f \n",k, 
classStat[k].tfidfs);
+                       //printf("    TFIDF score at %d is: %f  \n",k, 
classStat[k].tfidfs);
                        if (classStat[k].tfidfs < ONTOLOGY_FREQ_THRESHOLD) 
break; // values not frequent enough (list is sorted by tfidfs)
                        for (j = 0; j < ontmetadataCount && (found == 0); ++j) {
                                oid muri = ontmetadata[0][j];
@@ -1274,6 +1294,7 @@ PropStat* initPropStat(void) {
 
 #if USE_ONTOLOGY_NAMES
 /* Copied from Duc's code. */
+/*
 static
 void createPropStatistics(PropStat* propStat, int numMaxCSs, CSset* freqCSset) 
{
        int             i, j;
@@ -1310,6 +1331,44 @@ void createPropStatistics(PropStat* prop
                propStat->tfidfs[i] = log(((float)numMaxCSs) / (1 + 
propStat->freqs[i]));
        }
 }
+*/
+//[DUC] Create propstat for ontology only 
+static
+void createPropStatistics(PropStat* propStat, oid** ontattributes, int 
ontattributesCount) {
+       int             i;
+
+       for (i = 0; i < ontattributesCount; ++i) {
+               oid attr = ontattributes[1][i];
+               // add prop to propStat
+               BUN     bun = BUNfnd(BATmirror(propStat->pBat), (ptr) &attr);
+               if (bun == BUN_NONE) {
+                       if (propStat->pBat->T->hash && BATcount(propStat->pBat) 
> 4 * propStat->pBat->T->hash->mask) {
+                               HASHdestroy(propStat->pBat);
+                               BAThash(BATmirror(propStat->pBat), 
2*BATcount(propStat->pBat));
+                       }
+
+                       propStat->pBat = BUNappend(propStat->pBat, &attr, TRUE);
+
+                       if (propStat->numAdded == propStat->numAllocation) {
+                               propStat->numAllocation += INIT_PROP_NUM;
+
+                               propStat->freqs = realloc(propStat->freqs, 
((propStat->numAllocation) * sizeof(int)));
+                               propStat->tfidfs = realloc(propStat->tfidfs, 
((propStat->numAllocation) * sizeof(float)));
+                               if (!propStat->freqs || !propStat->tfidfs) 
{fprintf(stderr, "ERROR: Couldn't realloc memory!\n");}
+                       }
+                       propStat->freqs[propStat->numAdded] = 1;
+                       propStat->numAdded++;
+               } else {
+                       propStat->freqs[bun]++;
+               }
+       }
+
+       for (i = 0; i < propStat->numAdded; ++i) {
+               propStat->tfidfs[i] = log(((float)ontattributesCount) / (1 + 
propStat->freqs[i]));
+       }
+}
+
+//... [DUC]
 #endif
 
 #if USE_ONTOLOGY_NAMES
@@ -1329,9 +1388,43 @@ static
 void createOntologyLookupResult(oid** result, CSset* freqCSset, int* 
resultCount, oid** ontattributes, int ontattributesCount, oid** ontmetadata, 
int ontmetadataCount) {
        int             i, j;
        PropStat        *propStat;
+       float*          totaltfidfsPerOntology;         //[DUC]
+       oid             lastUri; 
 
        propStat = initPropStat();
-       createPropStatistics(propStat, freqCSset->numCSadded, freqCSset);
+
+       //[DUC] Change the function for getting propStat. Use ontattributes for 
the propStat. 
+       // Not the properties from freqCS
+       //createPropStatistics(propStat, freqCSset->numCSadded, freqCSset);
+       createPropStatistics(propStat, ontattributes, ontattributesCount);
+       
+
+       lastUri = BUN_NONE; 
+       totaltfidfsPerOntology = (float*) malloc(sizeof(float) * 
ontmetadataCount);
+       //printf("Init tfidf for all %d ontologies \n",ontmetadataCount );
+       for (i = 0; i < ontmetadataCount; ++i) {
+               oid auri = ontmetadata[0][i];
+
+               if (auri == lastUri){ 
+                       //printf("Duplication at %d value " BUNFMT "\n", i, 
auri); 
+                       continue; 
+               }
+               else lastUri = auri; 
+               totaltfidfsPerOntology[i] = 0; 
+
+               for (j = 0; j < ontattributesCount; j++){
+                       oid tmpuri = ontattributes[0][j];
+                       oid aattr = ontattributes[1][j];
+                       if (auri == tmpuri){
+                               BUN bun = BUNfnd(BATmirror(propStat->pBat), 
(ptr) &aattr);
+                               if (bun == BUN_NONE) printf("[Debug] This 
cannot happen \n");
+                               else
+                                       totaltfidfsPerOntology[i] += 
(propStat->tfidfs[bun] * propStat->tfidfs[bun]);
+                       }       
+               }
+               //printf("Computed totaltfidfsPerOntology of ontology %d: %f 
(uri = "BUNFMT")\n",i, totaltfidfsPerOntology[i],auri);
+       }
+       //... [DUC]
 
        for (i = 0; i < freqCSset->numCSadded; ++i) {
                CS              cs;
@@ -1347,8 +1440,8 @@ void createOntologyLookupResult(oid** re
                for (j = 0; j < ontologyCount; ++j) {
                        propOntologiesCount[j] = 0;
                }
-
-               printf("Get ontology for FreqId %d. Orignal numProp = %d \n", 
i, cs.numProp);
+               
+               //printf("Get ontology for FreqId %d. Orignal numProp = %d \n", 
i, cs.numProp);
 
                propOntologies = findOntologies(cs, propOntologiesCount, 
&propOntologiesOids);
 
@@ -1356,13 +1449,13 @@ void createOntologyLookupResult(oid** re
                printf("Prop ontologies count. \n");
                for (j = 0; j < ontologyCount; ++j) {
                        if (propOntologiesCount[j] > 0)
-                               printf("    (%d) props in ontology %d \n ", 
propOntologiesCount[j], j);
+                               printf("    %d props in ontology %d \n ", 
propOntologiesCount[j], j);
                }
                */
 
                // get class names
                resultCount[i] = 0;
-               result[i] = getOntologyCandidates(ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), 
propOntologiesOids, propOntologiesCount, ontologyCount, propStat);
+               result[i] = getOntologyCandidates(ontattributes, 
ontattributesCount, ontmetadata, ontmetadataCount, &(resultCount[i]), 
propOntologiesOids, propOntologiesCount, ontologyCount, 
propStat,totaltfidfsPerOntology);
 
                for (j = 0; j < ontologyCount; ++j) {
                        free(propOntologies[j]);
@@ -1373,6 +1466,7 @@ void createOntologyLookupResult(oid** re
                free(propOntologiesCount);
        }
        freePropStat(propStat);
+       free(totaltfidfsPerOntology);
 }
 #endif
 
diff --git a/monetdb5/extras/rdf/rdflabels.h b/monetdb5/extras/rdf/rdflabels.h
--- a/monetdb5/extras/rdf/rdflabels.h
+++ b/monetdb5/extras/rdf/rdflabels.h
@@ -60,6 +60,7 @@ typedef struct IncidentFKs {
 typedef struct ClassStat {
        oid             ontoClass;      // URI of the ontology class
        float           tfidfs;         // summarized tfidf score of all 
properties that accur in the ontology class
+       float           totaltfidfs;    // The total tfidf score for all 
properties of this ontology class
 } ClassStat;
 
 // Statistics for a type attribute value
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -4464,6 +4464,9 @@ str printSampleData(CSSample *csSample, 
        oid     objOid = BUN_NONE; 
        BATiter mapi;
        str     canStr; 
+#if USE_SHORT_NAMES
+       str     propStrShort = NULL;
+#endif
 
        mapi = bat_iterator(mbat);
 
@@ -4506,8 +4509,9 @@ str printSampleData(CSSample *csSample, 
                //List of columns
                fprintf(fout,"Subject");
                for (j = 0; j < sample.numProp; j++){
+                       if (freqCS.lstPropSupport[j] * 100 < freqCS.support * 
SAMPLE_FILTER_THRESHOLD) continue; 
 #if USE_SHORT_NAMES
-                       str propStrShort = NULL;
+                       propStrShort = NULL;
 #endif
                        takeOid(sample.lstProp[j], &propStr);   
 #if USE_SHORT_NAMES
@@ -4523,6 +4527,7 @@ str printSampleData(CSSample *csSample, 
                
                //List of support
                for (j = 0; j < sample.numProp; j++){
+                       if (freqCS.lstPropSupport[j] * 100 < freqCS.support * 
SAMPLE_FILTER_THRESHOLD) continue; 
                        fprintf(fout,";%d", freqCS.lstPropSupport[j]);
                }
                fprintf(fout, "\n");
@@ -4543,6 +4548,7 @@ str printSampleData(CSSample *csSample, 
                        GDKfree(subjStr); 
                        
                        for (j = 0; j < sample.numProp; j++){
+                               if (freqCS.lstPropSupport[j] * 100 < 
freqCS.support * SAMPLE_FILTER_THRESHOLD) continue; 
                                objOid = sample.lstObj[j][k];
                                if (objOid == BUN_NONE)
                                        fprintf(fout,";NULL");
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -179,6 +179,7 @@ typedef struct SubCSSet{
 #define MIN_FROMTABLE_SIZE_S6 100  // The minimum size of the "from" table in 
S6. Meaning that 
                                    // the CS's to-be-merged in this rule must 
cover > MIN_FROMTABLE_SIZE_S6 / MIN_PERCETAGE_S6 triples
 #define MINIMUM_TABLE_SIZE 10000   //The minimum number of triples coverred by 
a table (i.e., a final CS) 
+#define SAMPLE_FILTER_THRESHOLD 1  // SAMPLE_FILTER_THRESHOLD/ 100     
 
 typedef struct CSset{
        CS* items;
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to