Changeset: 1f98cd0cb212 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=1f98cd0cb212
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
        sql/backends/monet5/sql.mx
Branch: rdf
Log Message:

Generate type-specific relational tables + get the name for each table.

Known bug: the names assigned to the tables still have some problems.


diffs (truncated from 443 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -4548,6 +4548,45 @@ void getObjStr(BAT *mapbat, BATiter mapi
 
 
 }
+
+/* Store in name the SQL table name derived from the label oid nameId.
+ * Assumes the Tokenizer is open. At most 50 bytes, including the
+ * terminating NUL, are written to name. "NONAME" is used when there is
+ * no label, or the short label is empty or contains '.' or '-'
+ * (WEBCRAWL-specific problem with labels such as a.jpg, b.png). */
+void getTblName(char *name, oid nameId){
+       const size_t maxLen = 49;       // longest name kept, excluding NUL
+       str canStr = NULL; 
+       str canStrShort = NULL;
+       char    *pch;
+       size_t  len = 0;
+
+       if (nameId != BUN_NONE){
+               takeOid(nameId, &canStr);
+               if (canStr != NULL) 
+                       getPropNameShort(&canStrShort, canStr);
+       }
+
+       if (canStrShort != NULL && 
+               strstr(canStrShort,".") == NULL && 
+               strstr(canStrShort,"-") == NULL){
+
+               pch = strstr(canStrShort,"(");
+               if (pch != NULL) *pch = '\0';   //Remove (...) from table name
+
+               len = strlen(canStrShort);
+               if (len > maxLen) len = maxLen; //Truncate over-long names
+               memcpy(name, canStrShort, len);
+               name[len] = '\0';               //Always NUL-terminate
+       }
+
+       //Also covers a label that is empty (possibly after removing "(...)")
+       if (len == 0) strcpy(name,"NONAME");
+
+       if (canStr != NULL) GDKfree(canStr);
+       if (canStrShort != NULL) GDKfree(canStrShort); 
+}
+
 static 
 str printSampleData(CSSample *csSample, CSset *freqCSset, BAT *mbat, int num){
 
@@ -4931,6 +4970,35 @@ CSrel* generateCsRelBetweenMergeFreqSet(
 }
 
 
+/* Create a new data structure to store relationships including merged CS */
+/*
+static
+CSrel* getRefinedCsRelSet(CSrel *csrelFreqSet, CSset *freqCSset){
+       int     i,j;
+       int     numFreqCS = freqCSset->numOrigFreqCS; 
+       int     from, to;
+       CSrel   rel;
+       CSrel*  refinedCsRel;
+       int     numRel = freqCSset->numCSadded;
+       
+       refinedCsRel = initCSrelset(numRel);
+
+       for (i = 0; i < numRel; ++i) {
+               if (csrelFreqSet[i].numRef == 0) continue; // ignore CS without 
relations
+               rel = csrelFreqSet[i];
+               // update the 'from' value
+               from = i;
+               assert(freqCSset[from].parentFreqIdx == -1);
+               for (j = 0; j < rel.numRef; ++j) {
+                       to = rel.lstRefFreqIdx[j];
+                       assert(freqCSset->items[to].parentFreqIdx == -1);
+                       // add relation to new data structure
+                       addReltoCSRelWithFreq(from, to, rel.lstPropId[j], 
rel.lstCnt[j], rel.lstBlankCnt[j], &refinedCsRel[from]);
+               }
+       }
+       return refinedCsRel;
+}
+*/
 static
 CSrel* generateCsRelToMergeFreqSet(CSrel *csrelFreqSet, CSset *freqCSset){
        int     i,j;
@@ -5263,8 +5331,13 @@ RDFextractCSwithTypes(int *ret, bat *sba
        
        //Finally, re-create mergeFreqSet
        
+       
        *csRelMergeFreqSet = generateCsRelBetweenMergeFreqSet(csrelSet, 
freqCSset);
        printCSRel(freqCSset, *csRelMergeFreqSet, *freqThreshold);
+       
+       curT = clock(); 
+       printf ("Get the final relationships between mergeCS took %f. 
\n",((float)(curT - tmpLastT))/CLOCKS_PER_SEC);   
+       tmpLastT = curT;                
 
        printmergeCSSet(freqCSset, *freqThreshold);
        //getStatisticCSsBySize(csMap,maxNumProp); 
@@ -5458,7 +5531,7 @@ str triplesubsort(BAT **sbat, BAT **pbat
 }
 
 static
-void initCStables(CStableStat* cstablestat, CSset* freqCSset, CSPropTypes 
*csPropTypes, int numTables){
+void initCStables(CStableStat* cstablestat, CSset* freqCSset, CSPropTypes 
*csPropTypes, int numTables, CSlabel *labels, int *mTblIdxFreqIdxMapping){
 
        int             i,j, k; 
        int             tmpNumDefaultCol; 
@@ -5509,11 +5582,13 @@ void initCStables(CStableStat* cstablest
                cstablestat->lstcstable[i].lstMVTables = (CSMVtableEx *) 
malloc(sizeof(CSMVtableEx) * tmpNumDefaultCol); // TODO: Only allocate memory 
for multi-valued columns
                cstablestat->lstcstable[i].lstProp = (oid*)malloc(sizeof(oid) * 
tmpNumDefaultCol);
                cstablestat->lstcstable[i].colTypes = (ObjectType 
*)malloc(sizeof(ObjectType) * tmpNumDefaultCol);
+               cstablestat->lstcstable[i].tblname = 
labels[mTblIdxFreqIdxMapping[i]].name;
                #if CSTYPE_TABLE == 1
                tmpNumExCol = csPropTypes[i].numNonDefTypes; 
                cstablestat->lastInsertedSEx[i] = (oid*) malloc(sizeof(oid) * 
tmpNumExCol); 
                cstablestat->lstcstableEx[i].numCol = tmpNumExCol;
                cstablestat->lstcstableEx[i].colBats = 
(BAT**)malloc(sizeof(BAT*) * tmpNumExCol); 
+               cstablestat->lstcstableEx[i].tblname = 
labels[mTblIdxFreqIdxMapping[i]].name;
                #endif
 
                for(j = 0; j < tmpNumDefaultCol; j++){
@@ -5780,33 +5855,33 @@ void getRealValue(void **returnValue, oi
        switch (objType)
        {
                case STRING:
-                       printf("A String object value: %s \n",objStr);
+                       //printf("A String object value: %s \n",objStr);
                        if (*returnValue != NULL) free(*returnValue);
                        *returnValue = (char *)malloc(sizeof(char) * 
strlen(objStr) + 1); 
                        memcpy(*returnValue,objStr, sizeof(char) * 
strlen(objStr) + 1);
-                       printf("A String value of returnValue: %s \n", (char 
*)(*returnValue));
+                       //printf("A String value of returnValue: %s \n", (char 
*)(*returnValue));
                        break; 
                case DATETIME:
                        datetimeStr = getDateTimeFromRDFString(objStr);
                        if (*returnValue != NULL) free(*returnValue);
                        *returnValue = (char *)malloc(sizeof(char) * 
strlen(datetimeStr) + 1); 
                        memcpy(*returnValue,datetimeStr,sizeof(char) * 
strlen(objStr) + 1);
-                       printf("A datetime object value: %s \n",(char 
*)(*returnValue));
+                       //printf("A datetime object value: %s \n",(char 
*)(*returnValue));
                        break; 
                case INTEGER:
-                       printf("Full object value: %s \n",objStr);
+                       //printf("Full object value: %s \n",objStr);
                        realInt = getIntFromRDFString(objStr);
-                       printf("A INTEGER object value: %i \n",realInt);
+                       //printf("A INTEGER object value: %i \n",realInt);
                        *(int*)(*returnValue) = realInt;
                        break; 
                case FLOAT:
-                       printf("Full object value: %s \n",objStr);
+                       //printf("Full object value: %s \n",objStr);
                        realFloat = getFloatFromRDFString(objStr);
-                       printf("A FLOAT object value: %f \n",realFloat);
+                       //printf("A FLOAT object value: %f \n",realFloat);
                        *(float*)(*returnValue) = realFloat;
                        break; 
                default: //URI or BLANK NODE            
-                       printf("A URI object value: " BUNFMT " \n", objOid);
+                       //printf("A URI object value: " BUNFMT " \n", objOid);
                        realUri = objOid;
                        *(oid*)(*returnValue) = realUri;
        }
@@ -5944,12 +6019,11 @@ str RDFdistTriplesToCSs(int *ret, bat *s
                        assert(objType != MULTIVALUES);         //TODO: Remove 
this
                        tmpMVColIdx = 
csPropTypes[tblIdx].lstPropTypes[tmpColIdx].colIdxes[(int)objType];
                        tmpBat = 
cstablestat->lstcstable[tblIdx].colBats[tmpColIdx];
-                       BATprint(tmpBat);
-                       BATprint(tmpmvBat);
                        getRealValue(&realObjValue, *obt, objType, mi, mbat);
 
                        for (i = 0; i < 
cstablestat->lstcstable[tblIdx].lstMVTables[tmpColIdx].numCol; i++){
                                tmpmvBat = 
cstablestat->lstcstable[tblIdx].lstMVTables[tmpColIdx].mvBats[i];
+                               BATprint(tmpmvBat);
                                if (i == tmpMVColIdx){  
                                        // TODO: If i != 0, try to cast to 
default value                
                                        BUNappend(tmpmvBat, (ptr) realObjValue, 
TRUE);
@@ -6141,7 +6215,7 @@ RDFreorganize(int *ret, CStableStat *cst
        mfreqIdxTblIdxMapping = (int *) malloc (sizeof (int) * 
freqCSset->numCSadded); 
        initIntArray(mfreqIdxTblIdxMapping , freqCSset->numCSadded, -1);
 
-       mTblIdxFreqIdxMapping = (int *) malloc (sizeof (int) * 
freqCSset->numCSadded);  // A little bit reduntdant space
+       mTblIdxFreqIdxMapping = (int *) malloc (sizeof (int) * 
freqCSset->numCSadded);  // TODO: little bit reduntdant space
        initIntArray(mTblIdxFreqIdxMapping , freqCSset->numCSadded, -1);
 
        //Mapping from from CSId to TableIdx 
@@ -6192,7 +6266,7 @@ RDFreorganize(int *ret, CStableStat *cst
        tmpLastT = curT;                
 
        // Init CStableStat
-       initCStables(cstablestat, freqCSset, csPropTypes, numTables);
+       initCStables(cstablestat, freqCSset, csPropTypes, numTables, labels, 
mTblIdxFreqIdxMapping);
        
        // Summarize the statistics
        curNumMergeCS = countNumberMergeCS(freqCSset);
diff --git a/monetdb5/extras/rdf/rdfschema.h b/monetdb5/extras/rdf/rdfschema.h
--- a/monetdb5/extras/rdf/rdfschema.h
+++ b/monetdb5/extras/rdf/rdfschema.h
@@ -178,7 +178,7 @@ typedef struct SubCSSet{
 } SubCSSet;
 
 //#define INIT_NUM_CS 9999 // workaround
-#define INIT_NUM_CS 100 // workaround
+#define INIT_NUM_CS 500 // workaround
 #define SIM_THRESHOLD 0.6
 #define SIM_TFIDF_THRESHOLD 0.55
 #define IMPORTANCE_THRESHOLD 0.01
@@ -248,6 +248,7 @@ typedef struct CStable {
        CSMVtableEx     *lstMVTables; 
        int             numCol; 
        oid*            lstProp;
+       oid             tblname;        /* Label of the table */
 } CStable; 
 
 
@@ -255,6 +256,7 @@ typedef struct CStableEx {          /* For non-d
        BAT**           colBats; 
        ObjectType*     colTypes; 
        int             numCol; 
+       oid             tblname;        /* Label of the table */
 } CStableEx; 
 
 
@@ -328,6 +330,9 @@ rdf_export str
 RDFreorganize(int *ret, CStableStat *cstablestat, bat *sbatid, bat *pbatid, 
bat *obatid, bat *mapbatid, int *freqThreshold, int *mode);
 
 rdf_export void
+getTblName(char *name, oid nameId);
+
+rdf_export void
 freeCStableStat(CStableStat *cstablestat); 
 
 rdf_export void
diff --git a/sql/backends/monet5/sql.mx b/sql/backends/monet5/sql.mx
--- a/sql/backends/monet5/sql.mx
+++ b/sql/backends/monet5/sql.mx
@@ -7672,11 +7672,14 @@ SQLrdfreorganize(Client cntxt, MalBlkPtr
        int ret = 0; 
        CStableStat *cstablestat; 
        char    tmptbname[100]; 
+       char    tmpstr[20]; 
        char    tmptbnameex[100];
        //char  tmpviewname[100]; 
        char    tmpcolname[100]; 
        //char  viewcommand[500];
-       sql_subtype tpe; 
+       sql_subtype tpe;        
+       sql_subtype tpes[50];
+
        sql_table       **cstables; 
        sql_table       ***csmvtables;  //table for storing multi-values 
        #if CSTYPE_TABLE == 1
@@ -7692,7 +7695,8 @@ SQLrdfreorganize(Client cntxt, MalBlkPtr
 
        str msg;
        BAT     *sbat, *pbat, *obat, *mbat; 
-
+       BAT     *tmpbat; 
+       
        rethrow("sql.rdfShred", msg, getSQLContext(cntxt, mb, &m, NULL));
 
        if ((sch = mvc_bind_schema(m, *schema)) == NULL)
@@ -7752,8 +7756,41 @@ SQLrdfreorganize(Client cntxt, MalBlkPtr
 
        printf("Starting creating SQL Table -- \n");
        
+       
+
+       sql_find_subtype(&tpes[TYPE_oid], "oid", 31 , 0);
+       printf("Tpes %d Type name is: %s \n", TYPE_oid, 
tpes[TYPE_oid].type->sqlname);
+
+       sql_find_subtype(&tpes[TYPE_str], "varchar", 500 , 0);
+       printf("Tpes %d Type name is: %s \n", TYPE_str, 
tpes[TYPE_str].type->sqlname);
+
+       sql_find_subtype(&tpes[TYPE_dbl], "double", 53 , 0);
+       printf("Tpes %d Type name is: %s \n", TYPE_dbl, 
tpes[TYPE_dbl].type->sqlname);
+
+       sql_find_subtype(&tpes[TYPE_int], "int", 9 , 0);
+       printf("Tpes %d Type name is: %s \n", TYPE_int, 
tpes[TYPE_int].type->sqlname);
+
+       sql_find_subtype(&tpes[TYPE_flt], "real", 23, 0);
+       printf("Tpes %d Type name is: %s \n", TYPE_flt, 
tpes[TYPE_flt].type->sqlname);
+
+       /*
+       sql_find_subtype(&tpe, "float", 0 , 0);
+       printf("Test Type name is: %s \n", tpe.type->sqlname);
+       sql_find_subtype(&tpe, "int", 9 , 0);
+       printf("Test Type name is: %s \n", tpe.type->sqlname);
+       sql_find_subtype(&tpe, "oid", 31 , 0);
+       printf("Test Type name is: %s \n", tpe.type->sqlname);
+       */
+
        for (i = 0; i < cstablestat->numTables; i++){
-               sprintf(tmptbname, "cstable%d",i);
+               printf("creating table %d \n", i);
+               //sprintf(tmptbname, "cstable%d",i);
+
+               sprintf(tmpstr, "%d",i);
+               getTblName(tmptbname, cstablestat->lstcstable[i].tblname); 
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to