Changeset: 085499b0157f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=085499b0157f
Modified Files:
        monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:

Add CSs with a high number of references to the set of freqCS.

- Collect the reference count for each CS.

High reference threshold = 2 * frequent threshold

- Assign correct support (frequency) and coverage for each CS in freqCSset.


diffs (truncated from 519 to 300 lines):

diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -118,13 +118,6 @@ static void initCharArray(char* inputArr
        }
 }
 
-static void generateFreqCSMap(CSset *freqCSset, char *csFreqMap){
-       int i; 
-       for (i = 0; i < freqCSset->numCSadded; i++){
-               csFreqMap[freqCSset->items[i].csId] = 1;
-       }
-}
-
 static 
 void addCStoSet(CSset *csSet, CS item)
 {
@@ -323,7 +316,7 @@ void freeCSrelSet(CSrel *csrelSet, int n
 }
 
 static 
-void printCSrelSet(CSrel *csrelSet, char *csFreqMap, BAT* freqBat, int num, 
char isWriteTofile, int freqThreshold){
+void printCSrelSet(CSrel *csrelSet, int *csIdFreqIdxMap, BAT* freqBat, int 
num, char isWriteTofile, int freqThreshold){
 
        int     i; 
        int     j; 
@@ -337,7 +330,7 @@ void printCSrelSet(CSrel *csrelSet, char
                        if (csrelSet[i].numRef != 0){   //Only print CS with FK
                                printf("Relationship %d: ", i);
                                freq  = (int *) Tloc(freqBat, i);
-                               printf("CS " BUNFMT " (Freq: %d, isFreq: %d) 
--> ", csrelSet[i].origCSoid, *freq, csFreqMap[i]);
+                               printf("CS " BUNFMT " (Freq: %d, isFreq: %d) 
--> ", csrelSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
                                for (j = 0; j < csrelSet[i].numRef; j++){
                                        printf(BUNFMT " (%d) ", 
csrelSet[i].lstRefCSoid[j],csrelSet[i].lstCnt[j]);      
                                }       
@@ -358,7 +351,7 @@ void printCSrelSet(CSrel *csrelSet, char
                        if (csrelSet[i].numRef != 0){   //Only print CS with FK
                                fprintf(fout, "Relationship %d: ", i);
                                freq  = (int *) Tloc(freqBat, i);
-                               fprintf(fout, "CS " BUNFMT " (Freq: %d, isFreq: 
%d) --> ", csrelSet[i].origCSoid, *freq, csFreqMap[i]);
+                               fprintf(fout, "CS " BUNFMT " (Freq: %d, isFreq: 
%d) --> ", csrelSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
                                for (j = 0; j < csrelSet[i].numRef; j++){
                                        fprintf(fout, BUNFMT " (%d) ", 
csrelSet[i].lstRefCSoid[j],csrelSet[i].lstCnt[j]);       
                                }       
@@ -399,7 +392,7 @@ oid getMaxCSIdFromCSId(oid csId, int* cs
 
 
 static 
-str printCSrelWithMaxSet(CSset *freqCSset, int* csIdFreqIdxMap, CSrel 
*csrelToMaxSet, CSrel *csrelFromMaxSet, CSrel *csrelBetweenMaxSet, CSrel 
*csrelSet, char *csFreqMap, BAT* freqBat, int num, int freqThreshold){
+str printCSrelWithMaxSet(CSset *freqCSset, int* csIdFreqIdxMap, CSrel 
*csrelToMaxSet, CSrel *csrelFromMaxSet, CSrel *csrelBetweenMaxSet, CSrel 
*csrelSet, BAT* freqBat, int num, int freqThreshold){
 
        int     i; 
        int     j; 
@@ -463,7 +456,7 @@ str printCSrelWithMaxSet(CSset *freqCSse
                if (csrelToMaxSet[i].numRef != 0){      //Only print CS with FK
                        fprintf(fout, "Relationship %d: ", i);
                        freq  = (int *) Tloc(freqBat, i);
-                       fprintf(fout, "CS " BUNFMT " (Freq: %d, isFreq: %d) --> 
", csrelToMaxSet[i].origCSoid, *freq, csFreqMap[i]);
+                       fprintf(fout, "CS " BUNFMT " (Freq: %d, isFreq: %d) --> 
", csrelToMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
                        for (j = 0; j < csrelToMaxSet[i].numRef; j++){
                                fprintf(fout, BUNFMT " (%d) ", 
csrelToMaxSet[i].lstRefCSoid[j],csrelToMaxSet[i].lstCnt[j]);     
                        }       
@@ -488,8 +481,8 @@ str printCSrelWithMaxSet(CSset *freqCSse
                if (csrelFromMaxSet[i].numRef != 0){    //Only print CS with FK
                        fprintf(fout1, "Relationship %d: ", i);
                        freq  = (int *) Tloc(freqBat, i);
-                       fprintf(fout1, "CS " BUNFMT " (Freq: %d, isFreq: %d) 
--> ", csrelFromMaxSet[i].origCSoid, *freq, csFreqMap[i]);
-                       fprintf(fout1filter, "CS " BUNFMT " (Freq: %d, isFreq: 
%d) --> ", csrelFromMaxSet[i].origCSoid, *freq, csFreqMap[i]);           
+                       fprintf(fout1, "CS " BUNFMT " (Freq: %d, isFreq: %d) 
--> ", csrelFromMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
+                       fprintf(fout1filter, "CS " BUNFMT " (Freq: %d, isFreq: 
%d) --> ", csrelFromMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);             
 
 
                        for (j = 0; j < csrelFromMaxSet[i].numRef; j++){
                                fprintf(fout1, BUNFMT " (%d) ", 
csrelFromMaxSet[i].lstRefCSoid[j],csrelFromMaxSet[i].lstCnt[j]);        
@@ -534,8 +527,8 @@ str printCSrelWithMaxSet(CSset *freqCSse
                        fprintf(fout2, "Relationship %d: ", i);
                        fprintf(fout2filter, "Relationship %d: ", i);
                        freq  = (int *) Tloc(freqBat, i);
-                       fprintf(fout2, "CS " BUNFMT " (Freq: %d, isFreq: %d) 
--> ", csrelBetweenMaxSet[i].origCSoid, *freq, csFreqMap[i]);
-                       fprintf(fout2filter, "CS " BUNFMT " (Freq: %d, isFreq: 
%d) --> ", csrelBetweenMaxSet[i].origCSoid, *freq, csFreqMap[i]);
+                       fprintf(fout2, "CS " BUNFMT " (Freq: %d, isFreq: %d) 
--> ", csrelBetweenMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
+                       fprintf(fout2filter, "CS " BUNFMT " (Freq: %d, isFreq: 
%d) --> ", csrelBetweenMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
 
                        for (j = 0; j < csrelBetweenMaxSet[i].numRef; j++){
                                #if SHOWPROPERTYNAME
@@ -1098,7 +1091,8 @@ void freeCSset(CSset *csSet){
 
        #if STOREFULLCS
        for(i = 0; i < csSet->numOrigFreqCS; i ++){
-               free(csSet->items[i].lstObj);
+               if (csSet->items[i].lstObj != NULL)
+                       free(csSet->items[i].lstObj);
        }
        #endif
 
@@ -1176,13 +1170,17 @@ CS* creatCS(oid csId, int numP, oid* buf
        cs->numAllocation = numP; 
        /*By default, this CS is not known to be a subset of any other CS*/
        #if STOREFULLCS
-       cs->lstObj =  (oid*) malloc(sizeof(oid) * numP);
-       if (cs->lstObj == NULL){
-               printf("Malloc failed. at %d", numP);
-               exit(-1); 
-       }
-       copyOidSet(cs->lstObj, lstObject, numP); 
        cs->subject = subjectId; 
+       if (subjectId != BUN_NONE){
+               cs->lstObj =  (oid*) malloc(sizeof(oid) * numP);
+               if (cs->lstObj == NULL){
+                       printf("Malloc failed. at %d", numP);
+                       exit(-1); 
+               }
+               copyOidSet(cs->lstObj, lstObject, numP); 
+               }
+       else
+               cs->lstObj = NULL; 
        //printf("Create a CS with subjectId: " BUNFMT "\n", subjectId);
        #endif
 
@@ -1401,22 +1399,23 @@ str printFreqCSSet(CSset *freqCSset, BAT
 
 #if SHOWPROPERTYNAME
        str     propStr; 
+       #if STOREFULLCS
        str     subStr; 
        str     objStr; 
        oid     objOid; 
        char    objType; 
+       BUN     bun; 
+       #endif
+       int     ret; 
+       char*   schema = "rdf";
+       
        BATiter mapi; 
-       int     ret; 
-       BUN     bun; 
-       char*   schema = "rdf";
-
-
+       (void) mapi;
        if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
                throw(RDF, "rdf.rdfschema",
                                "could not open the tokenizer\n");
        }
        
-
        mapi = bat_iterator(mapbat); 
 #endif 
 
@@ -1425,10 +1424,7 @@ str printFreqCSSet(CSset *freqCSset, BAT
                        CS cs = (CS)freqCSset->items[i];
                        freq  = (int *) Tloc(freqBat, cs.csId);
 
-                       takeOid(cs.csId, &subStr);      
-
-                       printf("CS " BUNFMT " (Freq: %d) | Subject: %s  |  
Parent " BUNFMT " \n", cs.csId, *freq, subStr, 
freqCSset->items[cs.parentFreqIdx].csId);
-                       GDKfree(subStr);
+                       printf("CS " BUNFMT " (Freq: %d) | Parent " BUNFMT " 
\n", cs.csId, *freq, freqCSset->items[cs.parentFreqIdx].csId);
                        for (j = 0; j < cs.numProp; j++){
                                printf("  P:" BUNFMT " --> \n", cs.lstProp[j]); 
                        }       
@@ -1451,18 +1447,30 @@ str printFreqCSSet(CSset *freqCSset, BAT
                for (i = 0; i < freqCSset->numCSadded; i++){
                        CS cs = (CS)freqCSset->items[i];
                        freq  = (int *) Tloc(freqBat, cs.csId);
-                       
-
-                       takeOid(cs.subject, &subStr);   
-                       
-                       fprintf(fout,"CS " BUNFMT " (Freq: %d) | Subject: %s  | 
FreqParentIdx %d \n", cs.csId, *freq, subStr, cs.parentFreqIdx);
-
-                       // Filter max freq cs set
-                       if (cs.type == MAXCS){
-                               fprintf(fout2,"CS " BUNFMT " (Freq: %d) | 
Subject: %s  | Parent " BUNFMT " \n", cs.csId, *freq, subStr, cs.csId);
+                       if (cs.type != MAXCS) assert(*freq == cs.support);
+
+                       #if STOREFULLCS 
+                       if (cs.subject != BUN_NONE){
+                               takeOid(cs.subject, &subStr);   
+                               
+                               fprintf(fout,"CS " BUNFMT " (Freq: %d) | 
Subject: %s  | FreqParentIdx %d \n", cs.csId, *freq, subStr, cs.parentFreqIdx);
+
+                               // Filter max freq cs set
+                               if (cs.type == MAXCS){
+                                       fprintf(fout2,"CS " BUNFMT " (Freq: %d) 
| Subject: %s  | Parent " BUNFMT " \n", cs.csId, cs.support, subStr, cs.csId);
+                               }
+                                       
+                               GDKfree(subStr);
                        }
-                               
-                       GDKfree(subStr);
+                       else{
+                               fprintf(fout,"CS " BUNFMT " (Freq: %d) | 
Subject: NOTAVAI  | FreqParentIdx %d \n", cs.csId, *freq, cs.parentFreqIdx);
+
+                               if (cs.type == MAXCS){
+                                       fprintf(fout2,"CS " BUNFMT " (Freq: %d) 
| Subject: NOTAVAI  | Parent " BUNFMT " \n", cs.csId, cs.support, cs.csId);
+                               }
+                                       
+                       }
+                       #endif  
 
                        for (j = 0; j < cs.numProp; j++){
                                takeOid(cs.lstProp[j], &propStr);
@@ -1473,30 +1481,34 @@ str printFreqCSSet(CSset *freqCSset, BAT
                                }
 
                                GDKfree(propStr);
-
+                               
+                               #if STOREFULLCS
                                // Get object value
-                               objOid = cs.lstObj[j]; 
-
-                               objType = getObjType(objOid); 
-
-                               if (objType == URI || objType == BLANKNODE){
-                                       objOid = objOid - ((oid)objType << 
(sizeof(BUN)*8 - 4));
-                                       takeOid(objOid, &objStr); 
+                               if (cs.lstObj != NULL){
+                                       objOid = cs.lstObj[j]; 
+
+                                       objType = getObjType(objOid); 
+
+                                       if (objType == URI || objType == 
BLANKNODE){
+                                               objOid = objOid - ((oid)objType 
<< (sizeof(BUN)*8 - 4));
+                                               takeOid(objOid, &objStr); 
+                                       }
+                                       else{
+                                               objOid = objOid - (objType*2 + 
1) *  RDF_MIN_LITERAL;   /* Get the real objOid from Map or Tokenizer */ 
+                                               bun = BUNfirst(mapbat);
+                                               objStr = (str) BUNtail(mapi, 
bun + objOid); 
+                                       }
+
+                                       fprintf(fout, "  O: %s \n", objStr);
+                                       if (cs.type == MAXCS){
+                                               fprintf(fout2, "  O: %s \n", 
objStr);
+                                       }
+
+                                       if (objType == URI || objType == 
BLANKNODE){
+                                               GDKfree(objStr);
+                                       }
                                }
-                               else{
-                                       objOid = objOid - (objType*2 + 1) *  
RDF_MIN_LITERAL;   /* Get the real objOid from Map or Tokenizer */ 
-                                       bun = BUNfirst(mapbat);
-                                       objStr = (str) BUNtail(mapi, bun + 
objOid); 
-                               }
-
-                               fprintf(fout, "  O: %s \n", objStr);
-                               if (cs.type == MAXCS){
-                                       fprintf(fout2, "  O: %s \n", objStr);
-                               }
-
-                               if (objType == URI || objType == BLANKNODE){
-                                       GDKfree(objStr);
-                               }
+                               #endif
 
 
                        }       
@@ -2228,8 +2240,8 @@ void getMaximumFreqCSs(CSset *freqCSset,
                }
                else{
                        freqCSset->items[i].type = MAXCS;       //Update type 
for this freqCS
-                       freqCSset->items[i].coverage += *coverage;
-                       freqCSset->items[i].support += *freq;
+                       //freqCSset->items[i].coverage += *coverage;
+                       //freqCSset->items[i].support += *freq;
 
                }
 
@@ -3069,6 +3081,66 @@ str RDFassignCSId(int *ret, BAT *sbat, B
        return MAL_SUCCEED; 
 }
 
+
+static 
+str RDFgetRefCounts(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi, 
oid *subjCSMap, int maxNumProp, BUN maxSoid, int *refCount){
+
+       BUN             p, q; 
+       oid             *sbt, *pbt, *obt; 
+       oid             curS;           /* current Subject oid */
+       oid             curP;           /* current Property oid */
+       int             numP;           /* Number of properties for current S */
+       oid*            buff;    
+       oid             tmpCSid; 
+
+       char            objType;
+       oid             realObjOid;     
+
+       buff = (oid *) malloc (sizeof(oid) * maxNumProp);
+
+       numP = 0;
+       curP = 0; 
+       curS = 0; 
+
+       BATloop(sbat, p, q){
+               sbt = (oid *) BUNtloc(si, p);           
+               if (*sbt != curS){
+                       curS = *sbt; 
+                       curP = 0;
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

Reply via email to