Changeset: 782ccaa7dff9 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=782ccaa7dff9
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Change FullSampleData to print only 8 columns and add a file that contains the
"solutions" (ordered candidates).


diffs (truncated from 369 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -5507,9 +5507,15 @@ str initFullSampleData(CSSampleExtend *c
                csSampleEx[i].name = cstablestat->lstcstable[i].tblname; 
                csSampleEx[i].candidateCount = tmpNumcand;
                csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * 
tmpNumcand); 
+               csSampleEx[i].candidatesOrdered = (oid*)malloc(sizeof(oid) * 
tmpNumcand); 
                for (k = 0; k < tmpNumcand; k++){
                        csSampleEx[i].candidates[k] = 
label[freqId].candidates[k]; 
-               }
+                       csSampleEx[i].candidatesOrdered[k] = 
label[freqId].candidates[k]; 
+               }
+               csSampleEx[i].candidatesNew = label[freqId].candidatesNew;
+               csSampleEx[i].candidatesOntology = 
label[freqId].candidatesOntology;
+               csSampleEx[i].candidatesType = label[freqId].candidatesType;
+               csSampleEx[i].candidatesFK = label[freqId].candidatesFK;
                //Randomly exchange the value, change the position k with a 
random pos
                for (k = 0; k < tmpNumcand; k++){
                        randValue = rand() % tmpNumcand;
@@ -5650,6 +5656,7 @@ void freeSampleExData(CSSampleExtend *cs
                free(csSampleEx[i].lstIsInfrequentProp);
                free(csSampleEx[i].lstIsMVCol);
                free(csSampleEx[i].candidates); 
+               free(csSampleEx[i].candidatesOrdered); 
                free(csSampleEx[i].lstSubjOid);
                for (j = 0; j < csSampleEx[i].numProp; j++){
                        BBPunfix(csSampleEx[i].colBats[j]->batCacheid);
@@ -6184,11 +6191,11 @@ str printSampleData(CSSample *csSample, 
 
 #if NO_OUTPUTFILE == 0
 static 
-str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat){
+str printFullSampleData(CSSampleExtend *csSampleEx, int num, BAT *mbat, 
PropStat *propStat, CSset *freqCSset){
 
        int     i,j, k; 
-       FILE    *fout, *fouttb, *foutis; 
-       char    filename[100], filename2[100], filename3[100];
+       FILE    *fout, *foutsol, *fouttb, *foutis; 
+       char    filename[100], filename4[100], filename2[100], filename3[100];
        int     ret;
 
        str     propStr; 
@@ -6216,6 +6223,12 @@ str printFullSampleData(CSSampleExtend *
        str     propStrShort = NULL;
        char    *pch; 
 #endif
+       int*    propOrder;
+       int*    propOrderTfidf;
+       float*  tfidfValues;
+       int     numPropsInSampleTable;
+       int     found = 0;
+       CS      freqCS;
 
 
        mapi = bat_iterator(mbat);
@@ -6227,6 +6240,9 @@ str printFullSampleData(CSSampleExtend *
 
        strcpy(filename, "sampleDataFull");
        strcat(filename, ".txt");
+
+       strcpy(filename4, "sampleDataFullSolution");
+       strcat(filename4, ".txt");
        
        strcpy(filename2, "createSampleTableFull");
        strcat(filename2, ".sh");
@@ -6235,12 +6251,15 @@ str printFullSampleData(CSSampleExtend *
        strcat(filename3, ".sh");
        
        fout = fopen(filename,"wt"); 
+       foutsol = fopen(filename4,"wt");
        fouttb = fopen(filename2,"wt");
        foutis = fopen(filename3,"wt");
 
        for (i = 0; i < num; i++){
                sample = csSampleEx[i];
-               fprintf(fout,"Sample table %d Candidates: ", i);
+               freqCS = freqCSset->items[sample.freqIdx];
+               fprintf(fout,"Table %d\n", i);
+               fprintf(foutsol, "Table %d\n", i);
                for (j = 0; j < (int)sample.candidateCount; j++){
                        //fprintf(fout,"  "  BUNFMT,sample.candidates[j]);
                        if (sample.candidates[j] != BUN_NONE){
@@ -6251,18 +6270,43 @@ str printFullSampleData(CSSampleExtend *
                                getStringName(sample.candidates[j], &canStr, 
mapi, mbat, 1);
 #if USE_SHORT_NAMES
                                getPropNameShort(&canStrShort, canStr);
-                               fprintf(fout,";%s",  canStrShort);
+                               if (j+1 == (int)sample.candidateCount) 
fprintf(fout, "%s",  canStrShort);
+                               else fprintf(fout, "%s;", canStrShort);
                                GDKfree(canStrShort);
 #else
-                               fprintf(fout,";%s",  canStr);
+                               if (j+1 == (int)sample.candidateCount) 
fprintf(fout, "%s",  canStr);
+                               else fprintf(fout, "%s;", canStr);
+
 #endif
                                GDKfree(canStr); 
                        
                        }
+                       // ordered candidates for solution
+                       if (sample.candidatesOrdered[j] != BUN_NONE){
+#if USE_SHORT_NAMES
+                               str canStrShort = NULL;
+#endif
+                               getStringName(sample.candidatesOrdered[j], 
&canStr, mapi, mbat, 1);
+#if USE_SHORT_NAMES
+                               getPropNameShort(&canStrShort, canStr);
+                               if (j+1 == (int)sample.candidateCount) 
fprintf(foutsol, "%s (%s)",  canStrShort, canStr);
+                               else fprintf(foutsol, "%s (%s);", canStrShort, 
canStr);
+                               GDKfree(canStrShort);
+#else
+                               if (j+1 == (int)sample.candidateCount) 
fprintf(foutsol, "%s",  canStr);
+                               else fprintf(foutsol, "%s;", canStr);
+
+#endif
+                               GDKfree(canStr); 
+                       
+                       }
                }
                fprintf(fout, "\n");
+               fprintf(foutsol, "\n");
+
+               // print origin of candidates for solutions file
+               fprintf(foutsol, "New: %d, Type %d, Ontology %d, FK %d\n", 
sample.candidatesNew, sample.candidatesType, sample.candidatesOntology, 
sample.candidatesFK);
                
-
                if (sample.name != BUN_NONE){
                        str canStrShort = NULL;
                        //takeOid(sample.name, &canStr);
@@ -6289,6 +6333,80 @@ str printFullSampleData(CSSampleExtend *
                else 
                        fprintf(fouttb,"CREATE TABLE tbSample%d \n (\n", i);
 
+               //Number of tuples
+               fprintf(fout, "%d\n", freqCS.support);
+
+               // Compute property order (descending by support) and number of 
properties that are printed
+               found = 0;
+               numPropsInSampleTable = 
(sample.numProp>(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE))?(1+NUM_PROP_SUPPORT_SAMPLE+NUM_PROP_TFIDF_SAMPLE):sample.numProp;
+               propOrder = GDKmalloc(sizeof(int) * sample.numProp);
+               propOrderTfidf = GDKmalloc(sizeof(int) * sample.numProp);
+               tfidfValues = GDKmalloc(sizeof(float) * sample.numProp);
+               for (j = 0; j < sample.numProp; ++j) {
+                       propOrder[j] = j;
+                       propOrderTfidf[j] = j;
+               }
+
+               // To get the top <NUM_PROP_SUPPORT_SAMPLE> properties, sort 
all properties descending by support.
+               // The "subject" column remains at the first position 
regardless of its support.
+               // Sort using insertion sort.
+               for (j = 2; j < sample.numProp; ++j) {
+                       int tmpPos = propOrder[j];
+                       int tmpVal = freqCS.lstPropSupport[tmpPos];
+                       int k = j - 1;
+                       while (k >= 1 && freqCS.lstPropSupport[propOrder[k]] < 
tmpVal) { // sort descending
+                               propOrder[k + 1] = propOrder[k];
+                               k--;
+                       }
+                       propOrder[k + 1] = tmpPos;
+               }
+
+               // To get the top <NUM_PROP_TFIDF_SAMPLE> properties, sort all 
properties descending by tf-idf score.
+               for (j = 1; j < sample.numProp; ++j) {
+                       float tfidf;
+                       BUN bun = BUNfnd(BATmirror(propStat->pBat),(ptr) 
&sample.lstProp[j]);
+                       if (bun == BUN_NONE) {
+                               printf("Error: property not found\n");
+                       } else {
+                               tfidf = propStat->tfidfs[bun];
+                       }
+                       tfidfValues[j] = tfidf;
+               }
+
+               // Sort using insertion sort. Ignore "subject" column
+               for (j = 2; j < sample.numProp; ++j) {
+                       int tmpPos = propOrderTfidf[j];
+                       float tmpVal = tfidfValues[tmpPos];
+                       int k = j - 1;
+                       while (k >= 1 && tfidfValues[propOrderTfidf[k]] < 
tmpVal) { // sort descending
+                               propOrderTfidf[k + 1] = propOrderTfidf[k];
+                               k--;
+                       }
+                       propOrderTfidf[k + 1] = tmpPos;
+               }
+
+               // Add <NUM_PROP_TFIDF_SAMPLE> properties to propOrder that 
have a high tfidf score but are not yet in the top 1+NUM_PROP_TFIDF_SAMPLE 
values of propOrder
+               for (j = 1; j < sample.numProp; ++j) {
+                       int prop, foundProp, bound;
+                       if (found == NUM_PROP_TFIDF_SAMPLE) break;
+                       prop = propOrderTfidf[j];
+                       // check if prop is already chosen
+                       foundProp = 0;
+                       bound = 
(1+NUM_PROP_SUPPORT_SAMPLE)>sample.numProp?sample.numProp:(1+NUM_PROP_SUPPORT_SAMPLE);
 //minimum
+                       for (k = 1; k < bound; ++k) {
+                               if (propOrder[k] == prop) {
+                                       foundProp = 1;
+                                       break;
+                               }
+                       }
+                       if (!foundProp) {
+                               // add prop to propOrder
+                               // overwriting values is okay because the 
original values at position >= (1+NUM_PROP_SUPPORT_SAMPLE) in propOrder are not 
needed anymore
+                               propOrder[1+NUM_PROP_SUPPORT_SAMPLE+found] = 
prop;
+                               found++;
+                       }
+               }
+
                //List of columns
                fprintf(fout,"Subject");
                fprintf(fouttb,"SubjectCol string");
@@ -6298,12 +6416,12 @@ str printFullSampleData(CSSampleExtend *
                isDescription = 0; 
                isImage = 0;
                isSite = 0; 
-               for (j = 0; j < sample.numProp; j++){
-                       if (sample.lstIsInfrequentProp[j] == 1) continue; 
+               for (j = 0; j < numPropsInSampleTable; j++){
+                       int index = propOrder[j]; // apply mapping to change 
order of properties
 #if USE_SHORT_NAMES
                        propStrShort = NULL;
 #endif
-                       takeOid(sample.lstProp[j], &propStr);   
+                       takeOid(sample.lstProp[index], &propStr);       
 #if USE_SHORT_NAMES
                        getPropNameShort(&propStrShort, propStr);
                        fprintf(fout,";%s", propStrShort);
@@ -6334,7 +6452,7 @@ str printFullSampleData(CSSampleExtend *
                                        strcmp(propStrShort,"fax_number") == 0 
||
                                        strcmp(propStrShort,"app_id") == 0 
                                        )
-                               fprintf(fouttb,",\n%s_%d 
string",propStrShort,j);
+                               fprintf(fouttb,",\n%s_%d 
string",propStrShort,index);
                        else
                                fprintf(fouttb,",\n%s string",propStrShort);
 
@@ -6357,14 +6475,6 @@ str printFullSampleData(CSSampleExtend *
                fprintf(fout, "\n");
                fprintf(fouttb, "\n); \n \n");
                
-               //List of support
-               for (j = 0; j < sample.numProp; j++){
-                       if (sample.lstIsInfrequentProp[j] == 1) continue;
-                       fprintf(fout,";%d", sample.lstPropSupport[j]);
-               }
-               fprintf(fout, "\n");
-       
-               
                fprintf(foutis, "echo \"");
                //All the instances 
                for (k = 0; k < sample.numInstances; k++){
@@ -6382,10 +6492,9 @@ str printFullSampleData(CSSampleExtend *
 #endif
                        GDKfree(subjStr); 
                        
-                       for (j = 0; j < sample.numProp; j++){
-                               if (sample.lstIsInfrequentProp[j] == 1) 
continue;                       
-
-                               tmpBat = sample.colBats[j];
+                       for (j = 0; j < numPropsInSampleTable; j++){
+                               int index = propOrder[j]; // apply mapping to 
change order of properties
+                               tmpBat = sample.colBats[index];
                                tmpi = bat_iterator(tmpBat);
                                
                                if (tmpBat->ttype == TYPE_oid){ //URI or BLANK 
NODE  or MVCol
@@ -6455,7 +6564,12 @@ str printFullSampleData(CSSampleExtend *
                        fprintf(foutis, "\n");
                }
 
+               GDKfree(propOrder);
+               GDKfree(propOrderTfidf);
+               GDKfree(tfidfValues);
+
                fprintf(fout, "\n");
+               fprintf(foutsol, "\n");
                fprintf(foutis, "\" > tmp.txt \n \n");
 
                if (sample.name != BUN_NONE){
@@ -6491,6 +6605,7 @@ str printFullSampleData(CSSampleExtend *
        }
 
        fclose(fout);
+       fclose(foutsol);
        fclose(fouttb); 
        fclose(foutis); 
        
@@ -6970,7 +7085,7 @@ str getSampleData(int *ret, bat *mapbati
 
 #if NO_OUTPUTFILE == 0
 static
-str getFullSampleData(CStableStat* cstablestat, CSPropTypes *csPropTypes, int 
*mTblIdxFreqIdxMapping, CSlabel *labels, int numTables,  bat *lmapbatid, bat 
*rmapbatid, CSset *freqCSset, bat *mapbatid){
+str getFullSampleData(CStableStat* cstablestat, CSPropTypes *csPropTypes, int 
*mTblIdxFreqIdxMapping, CSlabel *labels, int numTables,  bat *lmapbatid, bat 
*rmapbatid, CSset *freqCSset, bat *mapbatid, PropStat *propStat){
 
        CSSampleExtend *csSampleEx;
        BAT *mbat = NULL; 
@@ -6982,7 +7097,7 @@ str getFullSampleData(CStableStat* cstab
        
        initFullSampleData(csSampleEx, mTblIdxFreqIdxMapping, labels, 
cstablestat, csPropTypes, freqCSset, numTables, lmapbatid, rmapbatid);
 
-       printFullSampleData(csSampleEx, numTables, mbat);
_______________________________________________
checkin-list mailing list
checkin-list@monetdb.org
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to