Changeset: 085499b0157f for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=085499b0157f
Modified Files:
monetdb5/extras/rdf/rdfschema.c
Branch: rdf
Log Message:
Add CS with a high number of references to the set of freqCS.
- Collect the reference count for each CS.
high reference threshold = 2 * frequent threshold
- Assign correct support (frequency) and coverage for each CS in freqCSset.
diffs (truncated from 519 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -118,13 +118,6 @@ static void initCharArray(char* inputArr
}
}
-static void generateFreqCSMap(CSset *freqCSset, char *csFreqMap){
- int i;
- for (i = 0; i < freqCSset->numCSadded; i++){
- csFreqMap[freqCSset->items[i].csId] = 1;
- }
-}
-
static
void addCStoSet(CSset *csSet, CS item)
{
@@ -323,7 +316,7 @@ void freeCSrelSet(CSrel *csrelSet, int n
}
static
-void printCSrelSet(CSrel *csrelSet, char *csFreqMap, BAT* freqBat, int num,
char isWriteTofile, int freqThreshold){
+void printCSrelSet(CSrel *csrelSet, int *csIdFreqIdxMap, BAT* freqBat, int
num, char isWriteTofile, int freqThreshold){
int i;
int j;
@@ -337,7 +330,7 @@ void printCSrelSet(CSrel *csrelSet, char
if (csrelSet[i].numRef != 0){ //Only print CS with FK
printf("Relationship %d: ", i);
freq = (int *) Tloc(freqBat, i);
- printf("CS " BUNFMT " (Freq: %d, isFreq: %d)
--> ", csrelSet[i].origCSoid, *freq, csFreqMap[i]);
+ printf("CS " BUNFMT " (Freq: %d, isFreq: %d)
--> ", csrelSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
for (j = 0; j < csrelSet[i].numRef; j++){
printf(BUNFMT " (%d) ",
csrelSet[i].lstRefCSoid[j],csrelSet[i].lstCnt[j]);
}
@@ -358,7 +351,7 @@ void printCSrelSet(CSrel *csrelSet, char
if (csrelSet[i].numRef != 0){ //Only print CS with FK
fprintf(fout, "Relationship %d: ", i);
freq = (int *) Tloc(freqBat, i);
- fprintf(fout, "CS " BUNFMT " (Freq: %d, isFreq:
%d) --> ", csrelSet[i].origCSoid, *freq, csFreqMap[i]);
+ fprintf(fout, "CS " BUNFMT " (Freq: %d, isFreq:
%d) --> ", csrelSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
for (j = 0; j < csrelSet[i].numRef; j++){
fprintf(fout, BUNFMT " (%d) ",
csrelSet[i].lstRefCSoid[j],csrelSet[i].lstCnt[j]);
}
@@ -399,7 +392,7 @@ oid getMaxCSIdFromCSId(oid csId, int* cs
static
-str printCSrelWithMaxSet(CSset *freqCSset, int* csIdFreqIdxMap, CSrel
*csrelToMaxSet, CSrel *csrelFromMaxSet, CSrel *csrelBetweenMaxSet, CSrel
*csrelSet, char *csFreqMap, BAT* freqBat, int num, int freqThreshold){
+str printCSrelWithMaxSet(CSset *freqCSset, int* csIdFreqIdxMap, CSrel
*csrelToMaxSet, CSrel *csrelFromMaxSet, CSrel *csrelBetweenMaxSet, CSrel
*csrelSet, BAT* freqBat, int num, int freqThreshold){
int i;
int j;
@@ -463,7 +456,7 @@ str printCSrelWithMaxSet(CSset *freqCSse
if (csrelToMaxSet[i].numRef != 0){ //Only print CS with FK
fprintf(fout, "Relationship %d: ", i);
freq = (int *) Tloc(freqBat, i);
- fprintf(fout, "CS " BUNFMT " (Freq: %d, isFreq: %d) -->
", csrelToMaxSet[i].origCSoid, *freq, csFreqMap[i]);
+ fprintf(fout, "CS " BUNFMT " (Freq: %d, isFreq: %d) -->
", csrelToMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
for (j = 0; j < csrelToMaxSet[i].numRef; j++){
fprintf(fout, BUNFMT " (%d) ",
csrelToMaxSet[i].lstRefCSoid[j],csrelToMaxSet[i].lstCnt[j]);
}
@@ -488,8 +481,8 @@ str printCSrelWithMaxSet(CSset *freqCSse
if (csrelFromMaxSet[i].numRef != 0){ //Only print CS with FK
fprintf(fout1, "Relationship %d: ", i);
freq = (int *) Tloc(freqBat, i);
- fprintf(fout1, "CS " BUNFMT " (Freq: %d, isFreq: %d)
--> ", csrelFromMaxSet[i].origCSoid, *freq, csFreqMap[i]);
- fprintf(fout1filter, "CS " BUNFMT " (Freq: %d, isFreq:
%d) --> ", csrelFromMaxSet[i].origCSoid, *freq, csFreqMap[i]);
+ fprintf(fout1, "CS " BUNFMT " (Freq: %d, isFreq: %d)
--> ", csrelFromMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
+ fprintf(fout1filter, "CS " BUNFMT " (Freq: %d, isFreq:
%d) --> ", csrelFromMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
for (j = 0; j < csrelFromMaxSet[i].numRef; j++){
fprintf(fout1, BUNFMT " (%d) ",
csrelFromMaxSet[i].lstRefCSoid[j],csrelFromMaxSet[i].lstCnt[j]);
@@ -534,8 +527,8 @@ str printCSrelWithMaxSet(CSset *freqCSse
fprintf(fout2, "Relationship %d: ", i);
fprintf(fout2filter, "Relationship %d: ", i);
freq = (int *) Tloc(freqBat, i);
- fprintf(fout2, "CS " BUNFMT " (Freq: %d, isFreq: %d)
--> ", csrelBetweenMaxSet[i].origCSoid, *freq, csFreqMap[i]);
- fprintf(fout2filter, "CS " BUNFMT " (Freq: %d, isFreq:
%d) --> ", csrelBetweenMaxSet[i].origCSoid, *freq, csFreqMap[i]);
+ fprintf(fout2, "CS " BUNFMT " (Freq: %d, isFreq: %d)
--> ", csrelBetweenMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
+ fprintf(fout2filter, "CS " BUNFMT " (Freq: %d, isFreq:
%d) --> ", csrelBetweenMaxSet[i].origCSoid, *freq, csIdFreqIdxMap[i]);
for (j = 0; j < csrelBetweenMaxSet[i].numRef; j++){
#if SHOWPROPERTYNAME
@@ -1098,7 +1091,8 @@ void freeCSset(CSset *csSet){
#if STOREFULLCS
for(i = 0; i < csSet->numOrigFreqCS; i ++){
- free(csSet->items[i].lstObj);
+ if (csSet->items[i].lstObj != NULL)
+ free(csSet->items[i].lstObj);
}
#endif
@@ -1176,13 +1170,17 @@ CS* creatCS(oid csId, int numP, oid* buf
cs->numAllocation = numP;
/*By default, this CS is not known to be a subset of any other CS*/
#if STOREFULLCS
- cs->lstObj = (oid*) malloc(sizeof(oid) * numP);
- if (cs->lstObj == NULL){
- printf("Malloc failed. at %d", numP);
- exit(-1);
- }
- copyOidSet(cs->lstObj, lstObject, numP);
cs->subject = subjectId;
+ if (subjectId != BUN_NONE){
+ cs->lstObj = (oid*) malloc(sizeof(oid) * numP);
+ if (cs->lstObj == NULL){
+ printf("Malloc failed. at %d", numP);
+ exit(-1);
+ }
+ copyOidSet(cs->lstObj, lstObject, numP);
+ }
+ else
+ cs->lstObj = NULL;
//printf("Create a CS with subjectId: " BUNFMT "\n", subjectId);
#endif
@@ -1401,22 +1399,23 @@ str printFreqCSSet(CSset *freqCSset, BAT
#if SHOWPROPERTYNAME
str propStr;
+ #if STOREFULLCS
str subStr;
str objStr;
oid objOid;
char objType;
+ BUN bun;
+ #endif
+ int ret;
+ char* schema = "rdf";
+
BATiter mapi;
- int ret;
- BUN bun;
- char* schema = "rdf";
-
-
+ (void) mapi;
if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
throw(RDF, "rdf.rdfschema",
"could not open the tokenizer\n");
}
-
mapi = bat_iterator(mapbat);
#endif
@@ -1425,10 +1424,7 @@ str printFreqCSSet(CSset *freqCSset, BAT
CS cs = (CS)freqCSset->items[i];
freq = (int *) Tloc(freqBat, cs.csId);
- takeOid(cs.csId, &subStr);
-
- printf("CS " BUNFMT " (Freq: %d) | Subject: %s |
Parent " BUNFMT " \n", cs.csId, *freq, subStr,
freqCSset->items[cs.parentFreqIdx].csId);
- GDKfree(subStr);
+ printf("CS " BUNFMT " (Freq: %d) | Parent " BUNFMT "
\n", cs.csId, *freq, freqCSset->items[cs.parentFreqIdx].csId);
for (j = 0; j < cs.numProp; j++){
printf(" P:" BUNFMT " --> \n", cs.lstProp[j]);
}
@@ -1451,18 +1447,30 @@ str printFreqCSSet(CSset *freqCSset, BAT
for (i = 0; i < freqCSset->numCSadded; i++){
CS cs = (CS)freqCSset->items[i];
freq = (int *) Tloc(freqBat, cs.csId);
-
-
- takeOid(cs.subject, &subStr);
-
- fprintf(fout,"CS " BUNFMT " (Freq: %d) | Subject: %s |
FreqParentIdx %d \n", cs.csId, *freq, subStr, cs.parentFreqIdx);
-
- // Filter max freq cs set
- if (cs.type == MAXCS){
- fprintf(fout2,"CS " BUNFMT " (Freq: %d) |
Subject: %s | Parent " BUNFMT " \n", cs.csId, *freq, subStr, cs.csId);
+ if (cs.type != MAXCS) assert(*freq == cs.support);
+
+ #if STOREFULLCS
+ if (cs.subject != BUN_NONE){
+ takeOid(cs.subject, &subStr);
+
+ fprintf(fout,"CS " BUNFMT " (Freq: %d) |
Subject: %s | FreqParentIdx %d \n", cs.csId, *freq, subStr, cs.parentFreqIdx);
+
+ // Filter max freq cs set
+ if (cs.type == MAXCS){
+ fprintf(fout2,"CS " BUNFMT " (Freq: %d)
| Subject: %s | Parent " BUNFMT " \n", cs.csId, cs.support, subStr, cs.csId);
+ }
+
+ GDKfree(subStr);
}
-
- GDKfree(subStr);
+ else{
+ fprintf(fout,"CS " BUNFMT " (Freq: %d) |
Subject: NOTAVAI | FreqParentIdx %d \n", cs.csId, *freq, cs.parentFreqIdx);
+
+ if (cs.type == MAXCS){
+ fprintf(fout2,"CS " BUNFMT " (Freq: %d)
| Subject: NOTAVAI | Parent " BUNFMT " \n", cs.csId, cs.support, cs.csId);
+ }
+
+ }
+ #endif
for (j = 0; j < cs.numProp; j++){
takeOid(cs.lstProp[j], &propStr);
@@ -1473,30 +1481,34 @@ str printFreqCSSet(CSset *freqCSset, BAT
}
GDKfree(propStr);
-
+
+ #if STOREFULLCS
// Get object value
- objOid = cs.lstObj[j];
-
- objType = getObjType(objOid);
-
- if (objType == URI || objType == BLANKNODE){
- objOid = objOid - ((oid)objType <<
(sizeof(BUN)*8 - 4));
- takeOid(objOid, &objStr);
+ if (cs.lstObj != NULL){
+ objOid = cs.lstObj[j];
+
+ objType = getObjType(objOid);
+
+ if (objType == URI || objType ==
BLANKNODE){
+ objOid = objOid - ((oid)objType
<< (sizeof(BUN)*8 - 4));
+ takeOid(objOid, &objStr);
+ }
+ else{
+ objOid = objOid - (objType*2 +
1) * RDF_MIN_LITERAL; /* Get the real objOid from Map or Tokenizer */
+ bun = BUNfirst(mapbat);
+ objStr = (str) BUNtail(mapi,
bun + objOid);
+ }
+
+ fprintf(fout, " O: %s \n", objStr);
+ if (cs.type == MAXCS){
+ fprintf(fout2, " O: %s \n",
objStr);
+ }
+
+ if (objType == URI || objType ==
BLANKNODE){
+ GDKfree(objStr);
+ }
}
- else{
- objOid = objOid - (objType*2 + 1) *
RDF_MIN_LITERAL; /* Get the real objOid from Map or Tokenizer */
- bun = BUNfirst(mapbat);
- objStr = (str) BUNtail(mapi, bun +
objOid);
- }
-
- fprintf(fout, " O: %s \n", objStr);
- if (cs.type == MAXCS){
- fprintf(fout2, " O: %s \n", objStr);
- }
-
- if (objType == URI || objType == BLANKNODE){
- GDKfree(objStr);
- }
+ #endif
}
@@ -2228,8 +2240,8 @@ void getMaximumFreqCSs(CSset *freqCSset,
}
else{
freqCSset->items[i].type = MAXCS; //Update type
for this freqCS
- freqCSset->items[i].coverage += *coverage;
- freqCSset->items[i].support += *freq;
+ //freqCSset->items[i].coverage += *coverage;
+ //freqCSset->items[i].support += *freq;
}
@@ -3069,6 +3081,66 @@ str RDFassignCSId(int *ret, BAT *sbat, B
return MAL_SUCCEED;
}
+
+static
+str RDFgetRefCounts(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi,
oid *subjCSMap, int maxNumProp, BUN maxSoid, int *refCount){
+
+ BUN p, q;
+ oid *sbt, *pbt, *obt;
+ oid curS; /* current Subject oid */
+ oid curP; /* current Property oid */
+ int numP; /* Number of properties for current S */
+ oid* buff;
+ oid tmpCSid;
+
+ char objType;
+ oid realObjOid;
+
+ buff = (oid *) malloc (sizeof(oid) * maxNumProp);
+
+ numP = 0;
+ curP = 0;
+ curS = 0;
+
+ BATloop(sbat, p, q){
+ sbt = (oid *) BUNtloc(si, p);
+ if (*sbt != curS){
+ curS = *sbt;
+ curP = 0;
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list