Changeset: 9b6644b4d8f1 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=9b6644b4d8f1
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Detect the sub-family of each CS.
- Add data structures.
- Implement all functions for generating a signature for each subCS and checking
for duplicates.
diffs (truncated from 461 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -42,6 +42,30 @@ static void copyOidSet(oid* dest, oid* o
}
}
+
+/*
+ * Copy len type codes from orig into dest.
+ * dest must have room for at least len chars and must not overlap orig.
+ */
+static void copyTypesSet(char* dest, char* orig, int len){
+    size_t numBytes = len * sizeof(char);
+
+    memcpy(dest, orig, numBytes);
+}
+
+
+/*
+ * Compute a hash signature for a list of num type codes.
+ * Uses the djb2 scheme (http://www.cse.yorku.ca/~oz/hash.html):
+ * start from 5381 and, for each element, replace the hash by
+ * (hash << 5) + hash plus the element value.
+ */
+static oid RDF_hash_Tyleslist(char* types, int num){
+    oid hashCode = 5381u;
+    oid shifted;
+    int pos = 0;
+
+    while (pos < num){
+        shifted = (hashCode << 5) + hashCode;
+        hashCode = shifted + types[pos];
+        pos++;
+    }
+
+    return hashCode;
+}
+
/*
static void printArray(oid* inputArr, int num){
int i;
@@ -110,7 +134,7 @@ void printCSrelSet(CSrel *csrelSet, int
int j;
for (i = 0; i < num; i++){
if (csrelSet[i].numRef != 0){ //Only print CS with FK
- printf("Relationship i: ");
+ printf("Relationship %d: ", i);
printf("CS " BUNFMT " --> ", csrelSet[i].origCSoid);
for (j = 0; j < csrelSet[i].numRef; j++){
printf(BUNFMT " (%d) ",
csrelSet[i].lstRefCSoid[j],csrelSet[i].lstCnt[j]);
@@ -121,6 +145,128 @@ void printCSrelSet(CSrel *csrelSet, int
}
+/*
+ * Allocate a new SubCS holding a private copy of the first numP type
+ * codes found in buff.
+ *
+ *   subCSId    id stored in the new SubCS
+ *   numP       number of type codes to copy from buff
+ *   buff       source type codes
+ *   subCSsign  signature stored in the new SubCS
+ *
+ * Returns the new SubCS, or NULL (after printing an error) if memory
+ * could not be allocated. The caller owns the returned struct and its
+ * subTypes array.
+ */
static
+SubCS* creatSubCS(oid subCSId, int numP, char* buff, oid subCSsign){
+    SubCS *subcs = malloc(sizeof(SubCS));
+
+    if (subcs == NULL){
+        fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+        return NULL;
+    }
+
+    subcs->subTypes = (char*) malloc(sizeof(char) * numP);
+    /* malloc(0) may legitimately return NULL, so NULL is only a failure when numP > 0 */
+    if (subcs->subTypes == NULL && numP > 0){
+        fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+        free(subcs);
+        return NULL;
+    }
+
+    /* Nothing to copy for an empty list; also avoids handing memcpy a NULL pointer */
+    if (numP > 0)
+        copyTypesSet(subcs->subTypes, buff, numP);
+
+    subcs->subCSId = subCSId;
+    subcs->numSubTypes = numP;
+    subcs->sign = subCSsign;
+    return subcs;
+}
+
+/*
+ * Allocate an empty SubCSSet for the CS with id csId, with room for
+ * INIT_NUM_SUBCS sub-CSs and as many frequency counters.
+ *
+ * Returns the new set, or NULL (after printing an error) if memory could
+ * not be allocated. Assumes INIT_NUM_SUBCS > 0, so that a NULL result
+ * from malloc always means failure.
+ */
+static
+SubCSSet* createaSubCSSet(oid csId){
+    SubCSSet* subCSset = malloc(sizeof(SubCSSet));
+
+    if (subCSset == NULL){
+        fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+        return NULL;
+    }
+
+    subCSset->csId = csId;
+    subCSset->numAllocation = INIT_NUM_SUBCS;
+    subCSset->numSubCS = 0;
+    subCSset->subCSs = malloc(sizeof(SubCS) * INIT_NUM_SUBCS);
+    subCSset->freq = malloc(sizeof(int) * INIT_NUM_SUBCS);
+
+    if (subCSset->subCSs == NULL || subCSset->freq == NULL){
+        fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+        /* free(NULL) is a no-op, so whichever array was obtained is released */
+        free(subCSset->subCSs);
+        free(subCSset->freq);
+        free(subCSset);
+        return NULL;
+    }
+
+    return subCSset;
+}
+
+/*
+ * Build an array of numSubCSSet empty SubCSSets; entry i is initialised
+ * by createaSubCSSet(i).
+ *
+ * Returns the array, or NULL if memory could not be allocated (anything
+ * built so far is released first).
+ */
+static
+SubCSSet* initCS_SubCSMap(oid numSubCSSet){
+    oid i;
+    oid j;
+    SubCSSet *subcssets = malloc(sizeof(SubCSSet) * numSubCSSet);
+    SubCSSet *subcsset;
+
+    if (subcssets == NULL){
+        /* malloc(0) may return NULL without it being an error */
+        if (numSubCSSet > 0)
+            fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+        return NULL;
+    }
+
+    for (i = 0; i < numSubCSSet; i++){
+        subcsset = createaSubCSSet(i);
+        if (subcsset == NULL){
+            /* Undo the entries that were already initialised */
+            for (j = 0; j < i; j++){
+                free(subcssets[j].subCSs);
+                free(subcssets[j].freq);
+            }
+            free(subcssets);
+            return NULL;
+        }
+
+        subcssets[i] = *subcsset;
+        /*
+         * The struct was copied by value, so its subCSs/freq arrays are now
+         * owned by subcssets[i]; only the temporary shell must be released.
+         */
+        free(subcsset);
+    }
+
+    return subcssets;
+}
+/*
+ * Look in subcsset for a sub-CS that has the signature subCSsign, exactly
+ * numTypes types, and the same type codes as types.
+ *
+ * On a match, stores the index of that sub-CS in *existCSId and returns 1.
+ * Otherwise stores subcsset->numSubCS (the id a new sub-CS would get) in
+ * *existCSId and returns 0.
+ */
+static
+char checkExistsubCS(oid subCSsign, char* types, int numTypes, SubCSSet *subcsset, oid *existCSId){
+    int idx;
+    int pos;
+    SubCS *candidate;
+
+    for (idx = 0; idx < subcsset->numSubCS; idx++){
+        candidate = &(subcsset->subCSs[idx]);
+
+        /* Signature and length must agree before the contents are compared */
+        if (candidate->sign == subCSsign && candidate->numSubTypes == numTypes){
+            pos = 0;
+            while (pos < numTypes && candidate->subTypes[pos] == types[pos])
+                pos++;
+
+            /* The scan ran to the end without a mismatch */
+            if (pos >= numTypes){
+                *existCSId = idx;
+                return 1;
+            }
+        }
+    }
+
+    *existCSId = subcsset->numSubCS; //Id of new SubCS
+
+    return 0;
+}
+
+/*
+ * Append item to subcsSet with an initial frequency of 1. When the set is
+ * full, the subCSs and freq arrays are first grown by INIT_NUM_SUBCS
+ * entries.
+ *
+ * item is copied by value, so on success the set shares item.subTypes.
+ * If the arrays cannot be grown, an error is printed and the function
+ * returns without storing item: the existing entries, numSubCS and
+ * numAllocation are left as they were.
+ */
+static
+void addSubCStoSet(SubCSSet *subcsSet, SubCS item)
+{
+    void *newSubCSs;
+    void *newFreq;
+
+    if(subcsSet->numSubCS == subcsSet->numAllocation)
+    {
+        newSubCSs = realloc(subcsSet->subCSs, ((subcsSet->numAllocation + INIT_NUM_SUBCS) * sizeof(SubCS)));
+        if (!newSubCSs){
+            /* The old block is still valid; keep it instead of overwriting it with NULL */
+            fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+            return;
+        }
+        /* realloc may have moved the block, so store the new pointer right away */
+        subcsSet->subCSs = (SubCS*)newSubCSs;
+
+        newFreq = realloc(subcsSet->freq, ((subcsSet->numAllocation + INIT_NUM_SUBCS) * sizeof(int)));
+        if (!newFreq){
+            fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+            return;
+        }
+        subcsSet->freq = (int *) newFreq;
+
+        /* Publish the new capacity only once both arrays have grown */
+        subcsSet->numAllocation += INIT_NUM_SUBCS;
+    }
+
+    subcsSet->subCSs[subcsSet->numSubCS] = item;
+    subcsSet->freq[subcsSet->numSubCS] = 1;
+
+    subcsSet->numSubCS++;
+
+}
+
+/*
+ * Record one occurrence of the sub-CS described by the numP type codes in
+ * buff for the CS at index csId of csSubCSMap.
+ *
+ * If an identical sub-CS is already present in that CS's set, its
+ * frequency is incremented; otherwise a new sub-CS is created and
+ * appended to the set.
+ *
+ * Returns the id (index within the CS's SubCSSet) of the matching or
+ * newly added sub-CS.
+ */
+static
+oid addSubCS(char *buff, int numP, int csId, SubCSSet* csSubCSMap){
+    SubCSSet *subcsset;
+    oid subCSsign;
+    char isFound;
+    oid subCSId;
+    SubCS *subCS;
+
+    subcsset = &(csSubCSMap[csId]);
+
+    // Check the duplication
+    subCSsign = RDF_hash_Tyleslist(buff, numP);
+
+    isFound = checkExistsubCS(subCSsign, buff, numP, subcsset, &subCSId);
+
+    if (isFound == 0){ // Add new
+        subCS = creatSubCS(subCSId, numP, buff, subCSsign);
+        if (subCS == NULL){
+            fprintf(stderr, "ERROR: Couldn't create a new SubCS!\n");
+        }
+        else{
+            addSubCStoSet(subcsset,*subCS);
+            /*
+             * When nothing was found, subCSId equals the old numSubCS, so an
+             * unchanged counter means the item was not stored and its type
+             * array is still ours to release.
+             */
+            if (subcsset->numSubCS == subCSId)
+                free(subCS->subTypes);
+            /* The set holds a by-value copy; the heap shell is no longer needed */
+            free(subCS);
+        }
+    }
+    else{ // Exist
+        //Update frequency
+        subcsset->freq[subCSId]++;
+    }
+
+    return subCSId;
+
+}
+
+static
void addReltoCSRel(oid origCSoid, oid refCSoid, CSrel *csrel)
{
void *_tmp;
@@ -193,7 +339,7 @@ void freeCS(CS *cs){
*/
static
-CS* creatCS(oid subId, int numP, oid* buff){
+CS* creatCS(oid csId, int numP, oid* buff){
CS *cs = malloc(sizeof(CS));
cs->lstProp = (oid*) malloc(sizeof(oid) * numP);
@@ -203,7 +349,7 @@ CS* creatCS(oid subId, int numP, oid* bu
}
copyOidSet(cs->lstProp, buff, numP);
- cs->subIdx = subId;
+ cs->csId = csId;
cs->numProp = numP;
cs->numAllocation = numP;
cs->isSubset = 0; /*By default, this CS is not known to be a subset of
any other CS*/
@@ -374,7 +520,7 @@ void addNewCS(CSBats *csBats, BUN* csKey
*
* */
static
-oid putaCStoHash(CSBats *csBats, oid subjId, oid* key, int num,
+oid putaCStoHash(CSBats *csBats, oid* key, int num,
oid *csoid, char isStoreFreqCS, int freqThreshold, CSset
*freqCSset){
BUN csKey;
int *freq;
@@ -412,7 +558,7 @@ oid putaCStoHash(CSBats *csBats, oid sub
if (isStoreFreqCS == 1){ /* Store the frequent
CS to the CSset*/
//printf("FreqCS: Support = %d, Threshold %d
\n ", freq, freqThreshold);
if (*freq == freqThreshold){
- freqCS = creatCS(subjId, num, key);
+ freqCS = creatCS(csId, num, key);
addCStoSet(freqCSset, *freqCS);
}
}
@@ -715,7 +861,7 @@ void freeCSBats(CSBats *csBats){
static
-str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, CSset
*freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap, oid *maxCSoid){
+str RDFassignCSId(int *ret, BAT *sbat, BATiter si, BATiter pi, CSset
*freqCSset, int *freqThreshold, CSBats* csBats, oid *subjCSMap, oid *maxCSoid,
int *maxNumProp, int *maxNumPwithDup){
BUN p, q;
oid *sbt, *pbt;
@@ -723,9 +869,9 @@ str RDFassignCSId(int *ret, BAT *sbat, B
oid curP; /* current Property oid */
oid CSoid = 0; /* Characteristic set oid */
int numP; /* Number of properties for current S */
+ int numPwithDup = 0;
oid* buff;
int INIT_PROPERTY_NUM = 5000;
- int maxNumProp = 0;
oid returnCSid;
buff = (oid *) malloc (sizeof(oid) * INIT_PROPERTY_NUM);
@@ -739,12 +885,14 @@ str RDFassignCSId(int *ret, BAT *sbat, B
sbt = (oid *) BUNtloc(si, p);
if (*sbt != curS){
if (p != 0){ /* Not the first S */
- returnCSid = putaCStoHash(csBats, curS, buff,
numP, &CSoid, 1, *freqThreshold, freqCSset);
+ returnCSid = putaCStoHash(csBats, buff, numP,
&CSoid, 1, *freqThreshold, freqCSset);
subjCSMap[curS] = returnCSid;
- if (numP > maxNumProp)
- maxNumProp = numP;
+ if (numP > *maxNumProp)
+ *maxNumProp = numP;
+ if (numPwithDup > *maxNumPwithDup)
+ *maxNumPwithDup = numPwithDup;
if (returnCSid > *maxCSoid)
*maxCSoid = returnCSid;
@@ -752,6 +900,7 @@ str RDFassignCSId(int *ret, BAT *sbat, B
curS = *sbt;
curP = 0;
numP = 0;
+ numPwithDup = 0;
}
pbt = (oid *) BUNtloc(pi, p);
@@ -766,17 +915,22 @@ str RDFassignCSId(int *ret, BAT *sbat, B
numP++;
curP = *pbt;
}
+
+ numPwithDup++;
}
/*put the last CS */
- returnCSid = putaCStoHash(csBats, curS, buff, numP, &CSoid, 1,
*freqThreshold, freqCSset );
+ returnCSid = putaCStoHash(csBats, buff, numP, &CSoid, 1,
*freqThreshold, freqCSset );
subjCSMap[curS] = returnCSid;
- if (numP > maxNumProp)
- maxNumProp = numP;
-
+ if (numP > *maxNumProp)
+ *maxNumProp = numP;
+
+ if (numPwithDup > *maxNumPwithDup)
+ *maxNumPwithDup = numPwithDup;
+
if (returnCSid > *maxCSoid)
*maxCSoid = returnCSid;
@@ -788,62 +942,48 @@ str RDFassignCSId(int *ret, BAT *sbat, B
}
static
-str RDFrelationships(int *ret, BAT *sbat, BATiter si, BATiter pi, BATiter oi,
CSset *freqCSset,
- int *freqThreshold, CSBats* csBats, oid *subjCSMap, BUN
maxSoid, BUN maxCSoid){
+str RDFrelationships(int *ret, BAT *sbat, BATiter si, BATiter oi,
+ oid *subjCSMap, oid *subjSubCSMap, BUN maxSoid, BUN maxCSoid,
int maxNumPwithDup){
BUN p, q;
- oid *sbt, *pbt, *obt;
+ oid *sbt, *obt;
oid curS; /* current Subject oid */
- oid curP; /* current Property oid */
- oid CSoid = 0; /* Characteristic set oid */
- int numP; /* Number of properties for current S */
- oid* buff;
- int INIT_PROPERTY_NUM = 5000;
- int maxNumProp = 0;
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list