Changeset: 9ce92aca2707 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9ce92aca2707
Modified Files:
        monetdb5/extras/rdf/rdfalgebra.c
        monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

Assign new Ids for subject, predicate and object bats and sub-sort them 
according to P, S, O order


diffs (273 lines):

diff --git a/monetdb5/extras/rdf/rdfalgebra.c b/monetdb5/extras/rdf/rdfalgebra.c
--- a/monetdb5/extras/rdf/rdfalgebra.c
+++ b/monetdb5/extras/rdf/rdfalgebra.c
@@ -110,12 +110,6 @@ RDFpartialjoin(bat *retid, bat *lid, bat
 }
 */
 
-/*TODO: Modify the above function by using 
- * BATsubouterjoin
- *  
- * */
-
-
 str
 RDFpartialjoin(bat *retid, bat *lid, bat *rid, bat *inputid){
        BAT *left, *right, *result1, *result2, *result, *input;  
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -2865,6 +2865,79 @@ RDFextractPfromPSO(int *ret, bat *pbatid
 
 }
 
+static 
+BAT* getOriginalOBat(BAT *obat){
+       BAT*    origobat; 
+       BATiter oi; 
+       BUN     p,q; 
+       oid     *obt; 
+       char    objType; 
+       /* Build and return a copy of obat in which URI and blank-node oids have their type tag removed. */
+       origobat = BATcopy(obat,  obat->htype, obat->ttype, TRUE);
+       oi = bat_iterator(origobat); 
+       /* NOTE(review): origobat is used below without a NULL check -- confirm BATcopy cannot fail here. */
+       BATloop(origobat, p, q){
+
+               obt = (oid *) BUNtloc(oi, p); 
+               /* Check type of object */
+               objType = (char) ((*obt) >> (sizeof(BUN)*8 - 4))  &  7 ;        
/* Keep the low 3 bits of the oid's top 4 bits as the type tag */
+               /* For URI and blank-node oids, subtract the tag shifted back into place so only the untagged oid remains. */
+               if (objType == URI || objType == BLANKNODE){
+                       *obt = (*obt) - ((oid)objType << (sizeof(BUN)*8 - 4));
+               }
+               // Oids of literal data types are deliberately left unchanged:
+               // per the original author, the map bat also uses the tagged
+               // oid, so the tag does not need to be removed for them.
+
+       }
+       
+       return origobat; 
+}
+/*
+static 
+oid getTblidFromSoid(oid Soid){
+       int     freqCSid;       
+       
+       return freqCSid; 
+}
+*/
+
+static
+str triplesubsort(BAT **sbat, BAT **pbat, BAT **obat){
+       /* Runs three chained BATsubsort calls over *sbat, *pbat, *obat; each later call is given the previous call's o/g outputs. */
+       BAT *o1,*o2,*o3;
+       BAT *g1,*g2,*g3;
+       BAT *S = NULL, *P = NULL, *O = NULL;
+       /* Keep the input BATs: the sbat/pbat/obat slots are also passed as BATsubsort's first argument (presumably the sorted result -- confirm). */
+       S = *sbat;
+       P = *pbat;
+       O = *obat;
+       /* Sort on *sbat first, then *pbat, then *obat -- i.e. in parameter order, whatever columns the caller puts in those slots. */
+       if (BATsubsort(sbat, &o1, &g1, S, NULL, NULL, 0, 0) == GDK_FAIL){
+               if (S != NULL) BBPreclaim(S);
+               throw(RDF, "rdf.triplesubsort", "Fail in sorting for S");
+       }
+
+       if (BATsubsort(pbat, &o2, &g2, P, o1, g1, 0, 0) == GDK_FAIL){
+               BBPreclaim(S);
+               if (P != NULL) BBPreclaim(P);
+               throw(RDF, "rdf.triplesubsort", "Fail in sub-sorting for P");
+       }
+       if (BATsubsort(obat, &o3, &g3, O, o2, g2, 0, 0) == GDK_FAIL){
+               BBPreclaim(S);
+               BBPreclaim(P);
+               if (O != NULL) BBPreclaim(O);
+               throw(RDF, "rdf.triplesubsort", "Fail in sub-sorting for O");
+       }       
+       /* NOTE(review): o1/g1 are never unfixed, and S/P/O are not released on the success path -- confirm this is intended. */
+       BBPunfix(o2->batCacheid);
+       BBPunfix(g2->batCacheid);
+       BBPunfix(o3->batCacheid);
+       BBPunfix(g3->batCacheid);
+
+       return MAL_SUCCEED; 
+}
+
 
 str
 RDFreorganize(int *ret, bat *sbatid, bat *pbatid, bat *obatid, bat *mapbatid, 
int *freqThreshold){
@@ -2874,25 +2947,27 @@ RDFreorganize(int *ret, bat *sbatid, bat
        oid             *csFreqCSMap;   
        int             i; 
        oid             maxCSoid = 0; 
-       BAT             *sbat = NULL;
+       BAT             *sbat = NULL, *obat = NULL, *pbat = NULL;
        BATiter         si; 
        BUN             p,q; 
-       BAT             *sNewBat; 
+       BAT             *sNewBat, *lmap, *rmap, *oNewBat, *origobat, *pNewBat; 
        BUN             newId; 
        oid             *sbt; 
        oid             *lastSubjId;    /* Store the last subject Id in each 
freqCS */
        oid             freqId; 
-       oid             lastS; 
-
+       oid             lastS;
+       oid             l,r; 
+       bat             oNewBatid, pNewBatid; 
+       
        freqCSset = initCSset();
 
        if (RDFextractCSwithTypes(ret, sbatid, pbatid, obatid, mapbatid, 
freqThreshold, freqCSset,&subjCSMap, &maxCSoid) != MAL_SUCCEED){
                throw(RDF, "rdf.RDFreorganize", "Problem in extracting CSs");
        } 
        
-       printf("Start re-organizing triple store \n");
-       csFreqCSMap = (oid *) malloc (sizeof (oid) * maxCSoid); 
-       initArray(csFreqCSMap, maxCSoid, BUN_NONE);
+       printf("Start re-organizing triple store for " BUNFMT " CSs \n", 
maxCSoid);
+       csFreqCSMap = (oid *) malloc (sizeof (oid) * (maxCSoid + 1)); 
+       initArray(csFreqCSMap, (maxCSoid + 1), BUN_NONE);
 
 
        lastSubjId = (oid *) malloc (sizeof(oid) * freqCSset->numOrigFreqCS); 
@@ -2905,43 +2980,123 @@ RDFreorganize(int *ret, bat *sbatid, bat
                throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING);
        }
 
-       sNewBat = BATnew(TYPE_void, TYPE_oid, smallbatsz);
-
+       if ((obat = BATdescriptor(*obatid)) == NULL) {
+               BBPreleaseref(sbat->batCacheid);
+               throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING);
+       }
+
+       if ((pbat = BATdescriptor(*pbatid)) == NULL) {
+               BBPreleaseref(sbat->batCacheid);
+               BBPreleaseref(obat->batCacheid);
+               throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING);
+       }
+
+       sNewBat = BATnew(TYPE_void, TYPE_oid, BATcount(sbat));
        if (sNewBat== NULL) {
                throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING);
        }
-
+       BATseqbase(sNewBat, 0);
+       
+       lmap = BATnew(TYPE_void, TYPE_oid, smallbatsz);
+
+       if (lmap == NULL) {
+               throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING);
+       }
+       lmap->tsorted = TRUE;
+
+       BATseqbase(lmap, 0);
+       
+       rmap = BATnew(TYPE_void, TYPE_oid, smallbatsz);
+       if (rmap == NULL) {
+               throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING);
+       }
+
+       BATseqbase(rmap, 0);
+       
        si = bat_iterator(sbat); 
 
-       printf("Re-assigning Subject oids \n");
-       lastS = 0; 
+       printf("Re-assigning Subject oids ... ");
+       lastS = -1; 
        BATloop(sbat, p, q){
                sbt = (oid *) BUNtloc(si, p);
                freqId = csFreqCSMap[subjCSMap[*sbt]];
 
                if (freqId != BUN_NONE){
 
-                       if (lastS != *sbt){     //new subject
-                               lastSubjId[freqId]++;
-                               lastS = *sbt; 
-                       }
-
-                       //newId = csFreqCSMap[subjCSMap[*sbt]] * 10000 + p; 
-                       
                        newId = lastSubjId[freqId];
                        newId |= (BUN)freqId << (sizeof(BUN)*8 - 
NBITS_FOR_CSID);
 
-                       sNewBat = BUNappend(sNewBat, &newId, TRUE);
+                       if (lastS != *sbt){     //new subject
+                               lastS = *sbt; 
+
+                               l = *sbt; 
+                               r = newId; 
+
+                               lmap = BUNappend(lmap, &l, TRUE);
+                               rmap = BUNappend(rmap, &r, TRUE);
+                               lastSubjId[freqId]++;
+                       }
+
                }
+               else{   // Use original subject Id
+                       newId = *sbt; 
+               }
+
+               sNewBat = BUNappend(sNewBat, &newId, TRUE);
 
        }
 
+
+       //BATprint(VIEWcreate(BATmirror(lmap),rmap)); 
+       
+       origobat = getOriginalOBat(obat); 
+
+       //BATprint(origobat);
+       
+       if (RDFpartialjoin(&oNewBatid, &lmap->batCacheid, &rmap->batCacheid, 
&origobat->batCacheid) == MAL_SUCCEED){
+               if ((oNewBat = BATdescriptor(oNewBatid)) == NULL) {
+                       throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING);
+               }
+       }
+       else
+               throw(RDF, "rdf.RDFreorganize", "Problem in using 
RDFpartialjoin for obat");
+
+
+       if (RDFpartialjoin(&pNewBatid, &lmap->batCacheid, &rmap->batCacheid, 
&pbat->batCacheid) == MAL_SUCCEED){
+               if ((pNewBat = BATdescriptor(pNewBatid)) == NULL) {
+                       throw(MAL, "rdf.RDFreorganize", RUNTIME_OBJECT_MISSING);
+               }
+       }
+       else
+               throw(RDF, "rdf.RDFreorganize", "Problem in using 
RDFpartialjoin for obat");
+
+       //BATprint(oNewBat);
+       printf("Done! \n");
+       
+       printf("Sort triple table according to P, S, O order ... ");
+       if (triplesubsort(&pNewBat, &sNewBat, &oNewBat) != MAL_SUCCEED){
+               throw(RDF, "rdf.RDFreorganize", "Problem in sorting PSO");      
+       }       
+       printf("Done  \n");
+
+       BATprint(pNewBat);
+
+       BATprint(sNewBat);
+
+               
        freeCSset(freqCSset); 
-       free (subjCSMap); 
+       free(subjCSMap); 
        free(csFreqCSMap);
-
+       
+       BBPreclaim(lmap);
+       BBPreclaim(rmap); 
        BBPreclaim(sbat);
        BBPreclaim(sNewBat);
+       BBPreclaim(obat); 
+       BBPreclaim(origobat);
+       BBPreclaim(oNewBat); 
+       BBPreclaim(pbat); 
+       BBPreclaim(pNewBat); 
 
        return MAL_SUCCEED; 
 }
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to