Changeset: e7109fc24610 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=e7109fc24610
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:

Generate sample data for all tables


diffs (truncated from 721 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -695,6 +695,11 @@ char isInfrequentSampleProp(CS freqCS, i
        if (freqCS.lstPropSupport[propIdx] * 100 < freqCS.support * 
SAMPLE_FILTER_THRESHOLD) return 1; 
        else return 0;
 }
+static /*
+        * Returns 1 when pt.propFreq is below SAMPLE_FILTER_THRESHOLD percent
+        * of freqCS.support (compared as propFreq * 100 < support * threshold),
+        * and 0 otherwise.
+        */
+char isInfrequentSampleCol(CS freqCS, PropTypes pt){
+       if (pt.propFreq * 100 <  freqCS.support * SAMPLE_FILTER_THRESHOLD) 
return 1;
+       else return 0; 
+}
 
 static 
 void genCSPropTypesColIdx(CSPropTypes* csPropTypes, int numMergedCS, CSset* 
freqCSset){
@@ -4830,6 +4835,211 @@ void initSampleData(CSSample *csSample,B
 
        }
 }
+
+/*
+ * Builds the subject oid for the row at position `pos` of table `tblIdx`
+ * and stores it in *sOid: the value starts as `pos`, then (tblIdx + 1),
+ * shifted left by (sizeof(BUN)*8 - NBITS_FOR_CSID) bits, is OR-ed in.
+ * NOTE(review): `pos` is a signed int assigned to an oid; callers presumably
+ * pass non-negative positions that stay below the shifted table bits - confirm.
+ */
+static
+void getSubjIdFromTablePosition(int tblIdx, int pos, oid *sOid){
+       oid id; 
+       id = pos;
+       id |= (BUN)(tblIdx + 1) << (sizeof(BUN)*8 - NBITS_FOR_CSID);
+       *sOid = id; 
+}
+/*
+ * Maps the encoded subject *sbt back to its original oid.
+ * The position of *sbt is looked up with BUNfnd on BATmirror(rmap); the value
+ * stored at that same position in lmap (read via Tloc) is written to *origSbt.
+ * Returns MAL_SUCCEED on success. Throws an RDF error when *sbt is not found,
+ * or when the lmap entry at that position equals BUN_NONE.
+ * NOTE(review): the error messages are tagged "rdf.RDFdistTriplesToCSs"
+ * rather than with this function's name.
+ */
+static
+str getOrigSbt(oid *sbt, oid *origSbt, BAT *lmap, BAT *rmap){
+       BUN pos; 
+       oid *tmp; 
+       pos = BUNfnd(BATmirror(rmap),sbt);
+       if (pos == BUN_NONE){
+               throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded subject 
must be in rmap");
+       }
+       tmp = (oid *) Tloc(lmap, pos);
+       if (*tmp == BUN_NONE){
+               throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded subject must 
be in lmap");
+       }
+
+       *origSbt = *tmp;                
+
+       return MAL_SUCCEED; 
+}
+/*
+ * Maps the encoded object *obt back to its original oid, written to *origObt.
+ * The object type comes from getObjType(*obt). For URI and BLANKNODE objects
+ * the type tag is removed first; the result is then either translated through
+ * rmap/lmap (same lookup as getOrigSbt) or returned unchanged, depending on
+ * whether it exceeds maxObjectURIOid.
+ * Returns MAL_SUCCEED on success; throws an RDF error when the rmap lookup
+ * fails or the lmap entry equals BUN_NONE.
+ * NOTE(review): for any other object type tmporigOid keeps its initial value
+ * BUN_NONE; if BUN_NONE is larger than maxObjectURIOid (TODO confirm) that
+ * value is searched in rmap. The visible caller only passes values from
+ * TYPE_oid columns - confirm those are always URI/BLANKNODE.
+ */
+static
+str getOrigObt(oid *obt, oid *origObt, BAT *lmap, BAT *rmap){
+       BUN pos; 
+       oid *tmp; 
+       oid     tmporigOid = BUN_NONE; 
+       char objType; 
+       BUN     maxObjectURIOid =  ((oid)1 << (sizeof(BUN)*8 - NBITS_FOR_CSID - 
1)) - 1; //Base on getTblIdxFromS
+
+       objType = getObjType(*obt); 
+       /* Remove the type tag: objType shifted into the top 4 bits is subtracted. */
+       if (objType == URI || objType == BLANKNODE){
+               tmporigOid = (*obt) - ((oid)objType << (sizeof(BUN)*8 - 4));
+       }
+       /* Values above maxObjectURIOid go through rmap/lmap; others are returned as-is. */
+       if (tmporigOid > maxObjectURIOid){
+               pos = BUNfnd(BATmirror(rmap),&tmporigOid);
+               if (pos == BUN_NONE){
+                       throw(RDF, "rdf.RDFdistTriplesToCSs", "This encoded 
object must be in rmap");
+               }
+               tmp = (oid *) Tloc(lmap, pos);
+               if (*tmp == BUN_NONE){
+                       throw(RDF, "rdf.RDFdistTriplesToCSs", "The encoded 
object must be in lmap");
+               }
+
+               *origObt = *tmp;                
+       }
+       else{
+               *origObt = tmporigOid;
+       }
+
+       return MAL_SUCCEED; 
+}
+/*
+ * Fills csSampleEx[0 .. numTables-1] with sample data, one entry per table.
+ *
+ * For table i (frequent CS index mTblIdxFreqIdxMapping[i]) the function:
+ *  - copies up to NUM_SAMPLE_CANDIDATE label candidates and shuffles them by
+ *    swapping every position with a randomly chosen one;
+ *  - allocates lstSubjOid (NUM_SAMPLE_INSTANCE entries, preset to BUN_NONE);
+ *  - allocates the per-column arrays for (numProp - numInfreqProp) columns and
+ *    creates one sample BAT per column with the tail type of the source column;
+ *  - draws NUM_SAMPLE_INSTANCE random row positions and appends, per column,
+ *    the value found at that position. Subject oids, and non-nil values of
+ *    single-valued TYPE_oid columns, are first mapped back to their original
+ *    oids through getOrigSbt / getOrigObt.
+ * The sample BATs of table 0 are printed with BATprint.
+ *
+ * lmapbatid / rmapbatid identify the two mapping BATs; both are opened with
+ * BATdescriptor here and released with BBPunfix before the normal return.
+ * Returns MAL_SUCCEED, or throws when a mapping BAT is missing or an oid
+ * cannot be mapped back.
+ *
+ * NOTE(review): the throws inside the table loop leave the function without
+ * the BBPunfix calls on lmap/rmap (assuming throw() returns) - confirm
+ * whether that leaks the references.
+ * NOTE(review): the malloc results are used without a NULL check.
+ * NOTE(review): row positions are drawn independently, so the same row can be
+ * sampled more than once.
+ */
+static 
+str initFullSampleData(CSSampleExtend *csSampleEx, int *mTblIdxFreqIdxMapping, 
CSlabel *label, CStableStat* cstablestat, CSPropTypes *csPropTypes, CSset 
*freqCSset, int numTables,  bat *lmapbatid, bat *rmapbatid){
+       int     i, j, k; 
+       int     freqId; 
+       int     tmpNumcand; 
+       oid     tmpCandidate; 
+       int     randValue = 0; 
+       int     ranPosition = 0;        //random position of the instance in a 
table
+       int     tmpNumCols; 
+       int     colIdx; 
+       BAT     *tmpbat = NULL;
+       BATiter tmpi; 
+       BAT     *cursamplebat = NULL; 
+       int     tmpNumRows = 0; 
+       oid     tmpSoid = BUN_NONE, origSoid = BUN_NONE;  
+       oid     origOid = BUN_NONE; 
+       BAT     *lmap = NULL, *rmap = NULL; 
+       /* Open both mapping BATs; lmap is released again if rmap is missing. */
+       if ((lmap = BATdescriptor(*lmapbatid)) == NULL) {
+               throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
+       }
+       
+       if ((rmap = BATdescriptor(*rmapbatid)) == NULL) {
+               BBPreleaseref(lmap->batCacheid);
+               throw(MAL, "rdf.RDFdistTriplesToCSs", RUNTIME_OBJECT_MISSING);
+       }
+       srand(123456); /* fixed seed: the rand() sequence is the same on every run */
+       for (i = 0; i < numTables; i++){
+               freqId = mTblIdxFreqIdxMapping[i];
+               csSampleEx[i].freqIdx = freqId;
+               tmpNumcand = (NUM_SAMPLE_CANDIDATE > 
label[freqId].candidatesCount)?label[freqId].candidatesCount:NUM_SAMPLE_CANDIDATE;
+               csSampleEx[i].name = cstablestat->lstcstable[i].tblname; 
+               csSampleEx[i].candidateCount = tmpNumcand;
+               csSampleEx[i].candidates = (oid*)malloc(sizeof(oid) * 
tmpNumcand); 
+               for (k = 0; k < tmpNumcand; k++){
+                       csSampleEx[i].candidates[k] = 
label[freqId].candidates[k]; 
+               }
+               //Randomly exchange the value, change the position k with a 
random pos
+               for (k = 0; k < tmpNumcand; k++){
+                       randValue = rand() % tmpNumcand;
+                       tmpCandidate = csSampleEx[i].candidates[k];
+                       csSampleEx[i].candidates[k] = 
csSampleEx[i].candidates[randValue];
+                       csSampleEx[i].candidates[randValue] = tmpCandidate;
+               }
+               /* Subject oids of the sampled rows; BUN_NONE marks an unused slot. */
+               csSampleEx[i].lstSubjOid = (oid*)malloc(sizeof(oid) * 
NUM_SAMPLE_INSTANCE);
+               for (k = 0; k < NUM_SAMPLE_INSTANCE; k++)
+                       csSampleEx[i].lstSubjOid[k] = BUN_NONE; 
+
+               tmpNumCols = csPropTypes[i].numProp -  
csPropTypes[i].numInfreqProp; //already remove infrequent column;
+               csSampleEx[i].numProp = tmpNumCols;
+               
+               assert(tmpNumCols > 0); 
+               /* Per-column metadata and sample BATs, one slot per kept column. */
+               csSampleEx[i].lstProp = (oid*)malloc(sizeof(oid) * tmpNumCols); 
+               csSampleEx[i].lstIsInfrequentProp = (char*)malloc(sizeof(char) 
* tmpNumCols); 
+               csSampleEx[i].lstIsMVCol = (char*)malloc(sizeof(char) * 
tmpNumCols); 
+               csSampleEx[i].colBats = (BAT**)malloc(sizeof(BAT*) * 
tmpNumCols);
+               colIdx = -1;
+               csSampleEx[i].numInstances = 0;
+               for(j = 0; j < csPropTypes[i].numProp; j++){
+                       #if     REMOVE_INFREQ_PROP
+                       if (csPropTypes[i].lstPropTypes[j].defColIdx == -1)     
continue;  //Infrequent prop
+                       #endif
+                       colIdx++;
+                       csSampleEx[i].lstProp[colIdx] = 
csPropTypes[i].lstPropTypes[j].prop;
+                       /* Sample BAT uses the same tail type as the source column. */
+                       csSampleEx[i].colBats[colIdx] = BATnew(TYPE_void, 
cstablestat->lstcstable[i].colBats[colIdx]->ttype , NUM_SAMPLE_INSTANCE + 1);
+
+                       //Mark whether this col is infrequent sample cols
+                       if ( isInfrequentSampleCol(freqCSset->items[freqId], 
csPropTypes[i].lstPropTypes[j])){
+                               csSampleEx[i].lstIsInfrequentProp[colIdx] = 1;
+                       }
+                       else
+                               csSampleEx[i].lstIsInfrequentProp[colIdx] = 0;
+
+                       //Mark whether this col is a MV col
+                       csSampleEx[i].lstIsMVCol[colIdx] = 
csPropTypes[i].lstPropTypes[j].isMVProp;
+                       
+                       //if this is a multivalue column, get the data type of 
the first column
+
+               }
+               assert(colIdx == (tmpNumCols - 1)); 
+
+               
+               // Inserting instances to csSampleEx
+               
+               tmpNumRows = BATcount(cstablestat->lstcstable[i].colBats[0]);
+               /* NOTE(review): tmpNumRows == 0 makes the modulo below divide by zero. */
+               for (k = 0; k < NUM_SAMPLE_INSTANCE; k++){
+                       ranPosition = rand() % tmpNumRows;
+
+                       getSubjIdFromTablePosition(i, ranPosition, &tmpSoid);   
+                       
+                       if (getOrigSbt(&tmpSoid, &origSoid, lmap, rmap) != 
MAL_SUCCEED){
+                               throw(RDF, "rdf.RDFdistTriplesToCSs","Problem 
in getting the orignal sbt ");
+                       } 
+
+                       csSampleEx[i].lstSubjOid[k] = origSoid;
+                       /* Copy the value of every column at the sampled row. */
+                       for (j = 0; j < tmpNumCols; j++){
+                               cursamplebat = csSampleEx[i].colBats[j];
+
+                               tmpbat = cstablestat->lstcstable[i].colBats[j]; 
        
+                               tmpi = bat_iterator(tmpbat);
+
+                               if (tmpbat->ttype == TYPE_oid && 
csSampleEx[i].lstIsMVCol[j] == 0){
+                                       //Get the original object oid
+                                       oid *tmpOid = (oid *) BUNtail(tmpi, 
ranPosition);
+                                       if(*tmpOid != oid_nil){
+                                               if (getOrigObt(tmpOid, 
&origOid, lmap, rmap) != MAL_SUCCEED){
+                                                       throw(RDF, 
"rdf.RDFdistTriplesToCSs","Problem in getting the orignal obt ");
+                                               }
+                                               BUNappend(cursamplebat, 
&origOid, TRUE);
+                                       }
+                                       else{
+                                               BUNappend(cursamplebat, 
ATOMnilptr(TYPE_oid), TRUE);
+                                       }
+
+                               }
+                               else
+                                       BUNappend(cursamplebat, BUNtail(tmpi, 
ranPosition), TRUE);
+
+
+                               
+                       }
+                       csSampleEx[i].numInstances++;
+               }
+               /* Print the sample columns of the first table only. */
+               if (i == 0)
+                       for (j = 0; j < tmpNumCols; j++){
+                               
//BATprint(cstablestat->lstcstable[i].colBats[j]);
+                               BATprint(csSampleEx[i].colBats[j]);
+                       }
+               
+       }
+
+       BBPunfix(lmap->batCacheid);
+       BBPunfix(rmap->batCacheid);
+
+       return MAL_SUCCEED;
+
+}
+
 static 
 void freeSampleData(CSSample *csSample, int numCand){
        int i, j; 
@@ -4846,6 +5056,25 @@ void freeSampleData(CSSample *csSample, 
        free(csSample);
 }
 
+/*
+ * Releases everything held by the csSampleEx array of numCand entries.
+ * For each entry the lstProp, lstIsInfrequentProp, lstIsMVCol, candidates and
+ * lstSubjOid arrays are freed, each of its numProp column BATs is released
+ * with BBPunfix, and the colBats array is freed. Finally the csSampleEx array
+ * itself is freed, so the pointer must not be used after this call.
+ */
+static 
+void freeSampleExData(CSSampleExtend *csSampleEx, int numCand){
+       int i, j; 
+       for (i = 0; i < numCand; i++){
+               free(csSampleEx[i].lstProp);
+               free(csSampleEx[i].lstIsInfrequentProp);
+               free(csSampleEx[i].lstIsMVCol);
+               free(csSampleEx[i].candidates); 
+               free(csSampleEx[i].lstSubjOid);
+               for (j = 0; j < csSampleEx[i].numProp; j++){
+                       BBPunfix(csSampleEx[i].colBats[j]->batCacheid);
+               }
+               free(csSampleEx[i].colBats);
+       }
+
+       free(csSampleEx);
+}
+
 static 
 void addSampleInstance(oid subj, oid *buffO, oid* buffP, int numP, int 
sampleIdx, CSSample *csSample){
        int i,j; 
@@ -5217,6 +5446,295 @@ str printSampleData(CSSample *csSample, 
        return MAL_SUCCEED;
 }
 
+#if 0
+static 
+str printFullSampleData(CSSampleExtend *csSampleEx, CSset *freqCSset, BAT 
*mbat, int num, int sampleVersion){
+
+       int     i,j, k; 
+       FILE    *fout, *fouttb, *foutis; 
+       char    filename[100], filename2[100], filename3[100];
+       char    tmpStr[20], tmpStr2[20], tmpStr3[20];
+       int     ret;
+
+       str     propStr; 
+       str     subjStr; 
+       char*   schema = "rdf";
+       CSSample        sample; 
+       CS              freqCS; 
+       char    objType = 0; 
+       str     objStr;         
+       oid     objOid = BUN_NONE; 
+       BATiter mapi;
+       str     canStr; 
+       char    isTitle = 0; 
+       char    isUrl = 0;
+       char    isType = 0;
+       char    isDescription = 0;
+       char    isImage = 0; 
+       char    isSite = 0;
+       char    isEmail = 0; 
+       char    isCountry = 0; 
+       char    isLocality = 0;
+       BAT     *lmap = NULL, *rmap = NULL
+#if USE_SHORT_NAMES
+       str     propStrShort = NULL;
+       char    *pch; 
+#endif
+
+
+
+       mapi = bat_iterator(mbat);
+
+       if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+               throw(RDF, "rdf.rdfschema",
+                               "could not open the tokenizer\n");
+       }
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list

Reply via email to