Changeset: ba10cc54c6de for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=ba10cc54c6de
Modified Files:
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.h
Branch: rdf
Log Message:
Detect subjects that may be wrongly typed, based on the discriminating
properties of each type.
If a subject is marked with a type but contains a discriminating property
from another type, it may be wrongly typed.
+ Add a function that prints all the merged freqCS with the support of each
prop and marks REALLY INFREQUENT props.
diffs (truncated from 558 to 300 lines):
diff --git a/monetdb5/extras/rdf/rdfschema.c b/monetdb5/extras/rdf/rdfschema.c
--- a/monetdb5/extras/rdf/rdfschema.c
+++ b/monetdb5/extras/rdf/rdfschema.c
@@ -933,7 +933,7 @@ void printCSPropTypes(CSPropTypes* csPro
/* Print cspropTypes */
for (i = 0; i < numMergedCS; i++){
- fprintf(fout, "MergedCS %d (Freq: %d): \n", i,
freqCSset->items[csPropTypes[i].freqCSId].support);
+ fprintf(fout, "MergedCS %d (Freq: %d ) \n", i,
freqCSset->items[csPropTypes[i].freqCSId].support);
tmpIsMVCS = 0;
tmpIsMVCSFilter = 0;
for(j = 0; j < csPropTypes[i].numProp; j++){
@@ -2153,7 +2153,82 @@ str printMergedFreqCSSet(CSset *freqCSse
return MAL_SUCCEED;
}
-#endif
+
+
+/* Do not remove infrequent prop from final table.
+ *
+ * Writes a text report of the final tables, including the support of every
+ * property, to a file whose relative name is
+ * finalfreqCSFullInfoWithPropSupport<freqThreshold>.txt.
+ *
+ * For each i in [0, numTables) the report holds:
+ *   - one header line with the table index, csPropTypes[i].freqCSId, the
+ *     label name (the literal DUMMY when labels[freqId].name is BUN_NONE,
+ *     otherwise the string produced by getStringName), the support and the
+ *     coverage of the matching freqCSset item;
+ *   - one line per property with its oid, the string produced by takeOid
+ *     and csPropTypes[i].lstPropTypes[j].propFreq; a property whose
+ *     propFreq is below STRANGE_PROP_FREQUENCY is additionally tagged
+ *     [REALLY INFREQUENT PROP].
+ *
+ * Parameters:
+ *   csPropTypes   - per-table info; freqCSId and lstPropTypes[j].propFreq
+ *                   are read
+ *   numTables     - number of entries of csPropTypes that are printed
+ *   freqCSset     - set whose items[freqId] supplies support, coverage,
+ *                   numProp and lstProp
+ *   mapbatid      - id of the BAT handed to getStringName
+ *   freqThreshold - only used to build the output file name
+ *   labels        - indexed by freqId; the name field is read
+ *
+ * Returns MAL_SUCCEED after the report is written. Raises an exception via
+ * throw when the tokenizer cannot be opened or when BATdescriptor returns
+ * NULL.
+ *
+ * NOTE(review): the fopen result is never tested before it is passed to
+ * fprintf and fclose.
+ * NOTE(review): when BATdescriptor fails, the tokenizer opened just before
+ * is not closed on that path (assuming throw leaves the function - confirm).
+ * NOTE(review): the results of getStringName and takeOid are not checked
+ * before the returned strings are printed and freed.
+ */
+static
+str printFinalTableWithPropSupport(CSPropTypes* csPropTypes, int numTables,
CSset *freqCSset, bat *mapbatid, int freqThreshold, CSlabel* labels){
+
+ int i,j;
+ int freq;
+ int freqId;
+ FILE *fout;
+ char filename[100];
+ char tmpStr[20];
+ BAT *mapbat = NULL;
+ BATiter mapi;
+ str propStr;
+ int ret;
+ char* schema = "rdf";
+ CS cs;
+
+ (void) mapi; /* NOTE(review): mapi is assigned and used below, so this cast looks redundant */
+ if (TKNZRopen (NULL, &schema) != MAL_SUCCEED) {
+ throw(RDF, "rdf.rdfschema",
+ "could not open the tokenizer\n");
+ }
+
+ if ((mapbat = BATdescriptor(*mapbatid)) == NULL) {
+ throw(MAL, "rdf.RDFextractCSwithTypes", RUNTIME_OBJECT_MISSING);
+ }
+ mapi = bat_iterator(mapbat);
+
+ strcpy(filename, "finalfreqCSFullInfoWithPropSupport");
+ sprintf(tmpStr, "%d", freqThreshold);
+ strcat(filename, tmpStr);
+ strcat(filename, ".txt");
+
+ fout = fopen(filename,"wt"); /* NOTE(review): result is not checked before the writes below */
+
+ for (i = 0; i < numTables; i++){
+ freqId = csPropTypes[i].freqCSId;
+ cs = (CS)freqCSset->items[freqId];
+ freq = cs.support;
+
+ if (labels[freqId].name == BUN_NONE) {
+ fprintf(fout,"Table %d - FreqId %d - Name: %s (Freq:
%d | Coverage: %d) \n", i, freqId, "DUMMY", freq, cs.coverage);
+ } else {
+ str labelStr;
+
+ getStringName(labels[freqId].name, &labelStr, mapi,
mapbat, 1);
+ fprintf(fout,"Table %d - FreqId %d - Name: %s (Freq:
%d | Coverage: %d) \n", i, freqId, labelStr, freq, cs.coverage);
+ GDKfree(labelStr);
+ }
+
+
+ for (j = 0; j < cs.numProp; j++){
+ takeOid(cs.lstProp[j], &propStr);
+ //fprintf(fout, " P:" BUNFMT " --> ", cs.lstProp[j]);
+ fprintf(fout, " P(" BUNFMT ") %s | PropFreq: %d ",
cs.lstProp[j],propStr, csPropTypes[i].lstPropTypes[j].propFreq);
+ if (csPropTypes[i].lstPropTypes[j].propFreq <
STRANGE_PROP_FREQUENCY){
+ fprintf(fout, " [REALLY INFREQUENT PROP] ");
+ }
+ GDKfree(propStr);
+ fprintf(fout, "\n");
+
+ }
+ fprintf(fout, "\n");
+ }
+
+ fclose(fout);
+
+ BBPunfix(mapbat->batCacheid);
+ TKNZRclose(&ret);
+
+ return MAL_SUCCEED;
+}
+
+#endif /* NO_OUTPUTFILE == 0 */
/*
static
@@ -3459,6 +3534,165 @@ oid getMostSuitableName(CSlabel *labels,
}
#endif
+#if DETECT_INCORRECT_TYPE_SUBJECT
+
+#if USING_FINALTABLE
+static /*
+ * buildLabelStatForTable: group the final tables by their label.
+ *
+ * Pass 1 walks cstablestat->lstcstable[0 .. numTables-1]. A table whose
+ * tblname is BUN_NONE is only counted as a dummy. For every other table the
+ * name is looked up with BUNfnd on the mirrored labelStat->labelBat:
+ *   - not found: the name is appended to labelBat, lstCount is grown by
+ *     INIT_DISTINCT_LABEL entries when numLabeladded has reached
+ *     numAllocation, the new counter is set to 1 and numLabeladded is
+ *     incremented;
+ *   - found: lstCount[bun] is incremented.
+ * Before an append, an existing hash on labelBat is destroyed and rebuilt
+ * with twice the current count once the count exceeds 4 * hash->mask.
+ *
+ * The totals are then printed to stdout, labelStat->freqIdList is allocated
+ * with one row per distinct label (row i sized lstCount[i]) and every
+ * lstCount[i] is reset to 0.
+ *
+ * Pass 2 walks the tables again and stores the table index i in
+ * freqIdList[bun][lstCount[bun]], incrementing lstCount[bun], so the
+ * counters end at the same values they had after pass 1. Despite the field
+ * name, the stored values are table indexes.
+ *
+ * Parameters:
+ *   labelStat   - updated in place (labelBat, lstCount, numLabeladded,
+ *                 numAllocation, freqIdList)
+ *   numTables   - number of entries read from cstablestat->lstcstable
+ *   cstablestat - source of the table names (tblname)
+ *
+ * NOTE(review): the position returned by BUNfnd is used directly as an index
+ * into lstCount and freqIdList; this presumably relies on labelBat positions
+ * matching insertion order - confirm.
+ * NOTE(review): a failed realloc is only reported on stderr; the NULL result
+ * is still stored in lstCount and indexed right after.
+ * NOTE(review): the malloc results for freqIdList and its rows are not
+ * checked.
+ */
+void buildLabelStatForTable(LabelStat *labelStat, int numTables, CStableStat*
cstablestat){
+ int i;
+ BUN bun;
+ int *_tmp;
+ int numDummy = 0;
+ oid name;
+ int tblIdx = -1;
+
+ //Preparation: first pass, count how many tables carry each distinct label
+ for (i = 0; i < numTables; i++){
+ if ( cstablestat->lstcstable[i].tblname != BUN_NONE){
+ name = cstablestat->lstcstable[i].tblname;
+ bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr)
&name);
+ if (bun == BUN_NONE) {
+ //New string: label not yet present in labelBat
+ if (labelStat->labelBat->T->hash &&
BATcount(labelStat->labelBat) > 4 * labelStat->labelBat->T->hash->mask) {
+ HASHdestroy(labelStat->labelBat);
+ BAThash(BATmirror(labelStat->labelBat),
2*BATcount(labelStat->labelBat));
+ }
+
+ labelStat->labelBat =
BUNappend(labelStat->labelBat, (ptr) &name, TRUE);
+
+ if(labelStat->numLabeladded ==
labelStat->numAllocation)
+ {
+ labelStat->numAllocation +=
INIT_DISTINCT_LABEL;
+
+ _tmp = realloc(labelStat->lstCount,
(labelStat->numAllocation * sizeof(int)));
+
+ if (!_tmp){
+ fprintf(stderr, "ERROR:
Couldn't realloc memory!\n");
+ }
+ labelStat->lstCount = (int*)_tmp;
+ }
+ labelStat->lstCount[labelStat->numLabeladded] =
1;
+ labelStat->numLabeladded++;
+ }
+ else{
+ labelStat->lstCount[bun]++;
+ }
+ }
+ else
+ numDummy++;
+ }
+
+ printf("Collect label stat for final table: Total number of distinct
labels %d \n", labelStat->numLabeladded);
+ printf("Number of DUMMY freqCS: %d \n",numDummy);
+
+ //Build list of freqId corresponding to each label
+ labelStat->freqIdList = (int**) malloc(sizeof(int*) *
labelStat->numLabeladded);
+ for (i =0; i < labelStat->numLabeladded; i++){
+ labelStat->freqIdList[i] = (int*)malloc(sizeof(int) *
labelStat->lstCount[i]);
+ //reset the lstCount
+ labelStat->lstCount[i] = 0;
+ }
+
+ for (i = 0; i < numTables; i++){
+ name = cstablestat->lstcstable[i].tblname;
+ if (name != BUN_NONE){
+ bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr)
&name);
+ if (bun == BUN_NONE) {
+ fprintf(stderr, "[Error] All the name should be
stored already!\n");
+ }
+ else{
+ tblIdx = labelStat->lstCount[bun];
+ labelStat->freqIdList[bun][tblIdx] = i;
+ labelStat->lstCount[bun]++;
+ }
+ }
+ }
+
+}
+
+#else /* USING_FINALTABLE = 0*/
+
+static /*
+ * buildLabelStatForFinalMergeCS: group the final merged freqCSs by label.
+ *
+ * Only items of freqCSset whose parentFreqIdx is -1 are considered; all
+ * other items are skipped in both passes.
+ *
+ * Pass 1 walks freqCSset->items[0 .. numCSadded-1]. An item whose
+ * labels[i].name is BUN_NONE is only counted as a dummy. For every other
+ * item the name is looked up with BUNfnd on the mirrored
+ * labelStat->labelBat:
+ *   - not found: the name is appended to labelBat, lstCount is grown by
+ *     INIT_DISTINCT_LABEL entries when numLabeladded has reached
+ *     numAllocation, the new counter is set to 1 and numLabeladded is
+ *     incremented;
+ *   - found: lstCount[bun] is incremented.
+ * Before an append, an existing hash on labelBat is destroyed and rebuilt
+ * with twice the current count once the count exceeds 4 * hash->mask.
+ *
+ * The totals are then printed to stdout, labelStat->freqIdList is allocated
+ * with one row per distinct label (row i sized lstCount[i]) and every
+ * lstCount[i] is reset to 0.
+ *
+ * Pass 2 walks the same items again and stores the freqCS index i in
+ * freqIdList[bun][lstCount[bun]], incrementing lstCount[bun], so the
+ * counters end at the same values they had after pass 1.
+ *
+ * Parameters:
+ *   labelStat - updated in place (labelBat, lstCount, numLabeladded,
+ *               numAllocation, freqIdList)
+ *   freqCSset - numCSadded and items[i].parentFreqIdx are read
+ *   labels    - indexed like freqCSset->items; the name field is read
+ *
+ * NOTE(review): the position returned by BUNfnd is used directly as an index
+ * into lstCount and freqIdList; this presumably relies on labelBat positions
+ * matching insertion order - confirm.
+ * NOTE(review): a failed realloc is only reported on stderr; the NULL result
+ * is still stored in lstCount and indexed right after.
+ * NOTE(review): the malloc results for freqIdList and its rows are not
+ * checked.
+ */
+void buildLabelStatForFinalMergeCS(LabelStat *labelStat, CSset *freqCSset,
CSlabel *labels){
+ int i;
+ BUN bun;
+ int *_tmp;
+ int numDummy = 0;
+ oid name;
+ int freqIdx = -1;
+
+ //Preparation: first pass, count how many final freqCSs carry each distinct label
+ for (i = 0; i < freqCSset->numCSadded; i++){
+ if (freqCSset->items[i].parentFreqIdx != -1) continue;
+
+ if ( labels[i].name != BUN_NONE){
+ name = labels[i].name;
+ bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr)
&name);
+ if (bun == BUN_NONE) {
+ //New string: label not yet present in labelBat
+ if (labelStat->labelBat->T->hash &&
BATcount(labelStat->labelBat) > 4 * labelStat->labelBat->T->hash->mask) {
+ HASHdestroy(labelStat->labelBat);
+ BAThash(BATmirror(labelStat->labelBat),
2*BATcount(labelStat->labelBat));
+ }
+
+ labelStat->labelBat =
BUNappend(labelStat->labelBat, (ptr) &name, TRUE);
+
+ if(labelStat->numLabeladded ==
labelStat->numAllocation)
+ {
+ labelStat->numAllocation +=
INIT_DISTINCT_LABEL;
+
+ _tmp = realloc(labelStat->lstCount,
(labelStat->numAllocation * sizeof(int)));
+
+ if (!_tmp){
+ fprintf(stderr, "ERROR:
Couldn't realloc memory!\n");
+ }
+ labelStat->lstCount = (int*)_tmp;
+ }
+ labelStat->lstCount[labelStat->numLabeladded] =
1;
+ labelStat->numLabeladded++;
+ }
+ else{
+ labelStat->lstCount[bun]++;
+ }
+ }
+ else
+ numDummy++;
+ }
+
+ printf("Collect label stat for final mergeCS: Total number of distinct
labels %d \n", labelStat->numLabeladded);
+ printf("Number of DUMMY freqCS: %d \n",numDummy);
+
+ //Build list of freqId corresponding to each label
+ labelStat->freqIdList = (int**) malloc(sizeof(int*) *
labelStat->numLabeladded);
+ for (i =0; i < labelStat->numLabeladded; i++){
+ labelStat->freqIdList[i] = (int*)malloc(sizeof(int) *
labelStat->lstCount[i]);
+ //reset the lstCount
+ labelStat->lstCount[i] = 0;
+ }
+
+
+ for (i = 0; i < freqCSset->numCSadded; i++){
+ if (freqCSset->items[i].parentFreqIdx != -1) continue;
+
+ name = labels[i].name;
+ if (name != BUN_NONE){
+ bun = BUNfnd(BATmirror(labelStat->labelBat),(ptr)
&name);
+ if (bun == BUN_NONE) {
+ fprintf(stderr, "[Error] All the name should be
stored already!\n");
+ }
+ else{
+ freqIdx = labelStat->lstCount[bun];
+ labelStat->freqIdList[bun][freqIdx] = i;
+ labelStat->lstCount[bun]++;
+ }
+ }
+ }
+
+}
+#endif
+
+#endif
+
static
void buildLabelStat(LabelStat *labelStat, CSlabel *labels, CSset *freqCSset,
int k){
int i,j;
@@ -3545,6 +3779,7 @@ void buildLabelStat(LabelStat *labelStat
}
}
+
static
void freeLabelStat(LabelStat *labelStat){
int i;
@@ -5258,6 +5493,219 @@ str RDFassignCSId(int *ret, BAT *sbat, B
return MAL_SUCCEED;
}
+#if DETECT_INCORRECT_TYPE_SUBJECT
+
+static
+str RDFcheckWrongTypeSubject(BAT *sbat, BATiter si, BATiter pi, BATiter oi,
CSset *freqCSset, int maxNumPwithDup, int numTables, int
*mTblIdxFreqIdxMapping, LabelStat *labelStat, oid *subjCSMap, int
*csFreqCSMapping){
+
+ BUN p, q;
+ oid *sbt, *pbt, *obt;
+ oid curS; /* current Subject oid */
+ oid curP; /* current Property oid */
+ int numP; /* Number of properties for current S */
+ int numPwithDup = 0;
+ oid* buff;
+
+ //Only keep the most specific ontology-based rdftype value
+ int maxNumOntology = 20;
+ oid* rdftypeOntologyValues = NULL;
+ char* rdftypeSelectedValues = NULL; //Store which value is selected
+ char* rdftypeSpecificLevels = NULL; //Store specific level for each
value
+ BUN* rdftypeOntClassPos = NULL; //Position in the ontology class
+
+ int numTypeValues = 0;
+ #if EXTRAINFO_FROM_RDFTYPE
+ int tmpMaxSpecificLevel = 0;
+ int tmpSpecificLevel = 0;
+ BUN tmpOntClassPos = BUN_NONE; //index of the ontology class in
the ontmetaBat
+ PropStat *ontPropStat = NULL;
_______________________________________________
checkin-list mailing list
[email protected]
https://www.monetdb.org/mailman/listinfo/checkin-list