MonetDB: rdf - Assign labels to CS's and CS properties.

Linnea Passing Fri, 21 Jun 2013 08:10:18 -0700

Changeset: 56f9183f2a24 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=56f9183f2a24
Added Files:
        monetdb5/extras/rdf/rdflabels.c
        monetdb5/extras/rdf/rdflabels.h
Modified Files:
        monetdb5/extras/rdf/Makefile.ag
        monetdb5/extras/rdf/rdfontologyload.c
        monetdb5/extras/rdf/rdfontologyload.h
        monetdb5/extras/rdf/rdfschema.c
        monetdb5/extras/rdf/rdfschema.mal
Branch: rdf
Log Message:


Assign labels to CS's and CS properties.
Compute candidates and choose the best name. Generate textual and graphical 
representations of the resulting schema.


diffs (truncated from 1963 to 300 lines):

diff --git a/monetdb5/extras/rdf/Makefile.ag b/monetdb5/extras/rdf/Makefile.ag
--- a/monetdb5/extras/rdf/Makefile.ag
+++ b/monetdb5/extras/rdf/Makefile.ag
@@ -31,7 +31,7 @@ lib__rdf = {
        #MODULE
        NOINST
        #DIR = libdir/monetdb5
-       SOURCES = rdf.h rdfschema.h rdfparser.h rdfparser.c rdfontologyload.h 
rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c 
+       SOURCES = rdf.h rdfschema.h rdflabels.h rdfparser.h rdfparser.c 
rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c 
rdflabels.c 
 
        #SEP = _
        # LIBS =  ./hashmap/librdfhash  
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -0,0 +1,1662 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2012 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+#include "monetdb_config.h"
+#include "rdf.h"
+#include "rdflabels.h"
+#include "rdfschema.h"
+#include "tokenizer.h"
+#include <math.h>
+
+// list of known ontologies
+int ontologyCount = 73;
+ontology ontologies[] = {
+{{"http:", "www.facebook.com", "2008"}, 3},
+{{"http:", "facebook.com", "2008"}, 3},
+{{"http:", "developers.facebook.com", "schema"}, 3},
+{{"https:", "www.facebook.com", "2008"}, 3},
+{{"http:", "purl.org", "dc", "elements", "1.1"}, 5}, // dc DublinCore
+{{"http:", "purl.org", "dc", "terms"}, 4}, // DublinCore
+{{"http:", "purl.org", "goodrelations", "v1"}, 4}, // GoodRelations
+{{"http:", "purl.org", "rss", "1.0", "modules"}, 5},
+{{"http:", "purl.org", "stuff"}, 3},
+{{"http:", "www.purl.org", "stuff"}, 3},
+{{"http:", "ogp.me", "ns"}, 3},
+{{"https:", "ogp.me", "ns"}, 3},
+{{"http:", "www.w3.org", "1999", "02", "22-rdf-syntax-ns"}, 5}, // rdf
+{{"http:", "www.w3.org", "2000", "01", "rdf-schema"}, 5}, // rdfs
+{{"http:", "www.w3.org", "2004", "02", "skos", "core"}, 6}, // skos (Simple 
Knowledge Organization System)
+{{"http:", "www.w3.org", "2002", "07", "owl"}, 5},
+{{"http:", "www.w3.org", "2006", "vcard", "ns"}, 5}, // vcard
+{{"http:", "www.w3.org", "2001", "vcard-rdf", "3.0"}, 5},
+{{"http:", "www.w3.org", "2003", "01", "geo", "wgs84_pos"}, 6}, // geo
+{{"http:", "www.w3.org", "1999", "xhtml", "vocab"}, 5}, // xhtml
+{{"http:", "search.yahoo.com", "searchmonkey"}, 3},
+{{"https:", "search.yahoo.com", "searchmonkey"}, 3},
+{{"http:", "search.yahoo.co.jp", "searchmonkey"}, 3},
+{{"http:", "g.yahoo.com", "searchmonkey"}, 3},
+{{"http:", "opengraphprotocol.org", "schema"}, 3},
+{{"https:", "opengraphprotocol.org", "schema"}, 3},
+{{"http:", "opengraph.org", "schema"}, 3},
+{{"https:", "opengraph.org", "schema"}, 3},
+{{"http:", "creativecommons.org", "ns"}, 3}, // cc
+{{"http:", "rdf.data-vocabulary.org"}, 2}, // by google
+{{"http:", "rdfs.org", "sioc", "ns"}, 4}, // sioc (pronounced "shock", 
Semantically-Interlinked Online Communities Project)
+{{"http:", "xmlns.com", "foaf", "0.1"}, 4}, // foaf (Friend of a Friend)
+{{"http:", "mixi-platform.com", "ns"}, 3}, // japanese social graph
+{{"http:", "commontag.org", "ns"}, 3},
+{{"http:", "semsl.org", "ontology"}, 3}, // semantic web for second life
+{{"http:", "schema.org"}, 2},
+{{"http:", "openelectiondata.org", "0.1"}, 3},
+{{"http:", "search.aol.com", "rdf"}, 3},
+{{"http:", "www.loc.gov", "loc.terms", "relators"}, 4}, // library of congress
+{{"http:", "dbpedia.org", "ontology"}, 3}, // dbo
+{{"http:", "dbpedia.org", "resource"}, 3}, // dbpedia
+{{"http:", "dbpedia.org", "property"}, 3}, // dbp
+{{"http:", "www.aktors.org", "ontology", "portal"}, 4}, // akt (research, 
publications, ...)
+{{"http:", "purl.org", "ontology", "bibo"}, 4}, // bibo (bibliography)
+{{"http:", "purl.org", "ontology", "mo"}, 4}, // mo (music)
+{{"http:", "www.geonames.org", "ontology"}, 3}, // geonames
+{{"http:", "purl.org", "vocab", "frbr", "core"}, 5}, // frbr (Functional 
Requirements for Bibliographic Records)
+{{"http:", "www.w3.org", "2001", "XMLSchema"}, 4}, // xsd
+{{"http:", "www.w3.org", "2006", "time"}, 4}, // time
+{{"http:", "purl.org", "NET", "c4dm", "event.owl"}, 5}, // event
+{{"http:", "www.openarchives.org", "ore", "terms"}, 4}, // ore (Open Archive)
+{{"http:", "purl.org", "vocab", "bio", "0.1"}, 5}, // bio (biographical data)
+{{"http:", "www.holygoat.co.uk", "owl", "redwood", "0.1", "tags"}, 6}, // tag
+{{"http:", "rdfs.org", "ns", "void"}, 4}, // void (Vocabulary of Interlinked 
Datasets)
+{{"http:", "www.w3.org", "2006", "http"}, 4}, // http
+{{"http:", "purl.uniprot.org", "core"}, 3}, // uniprot (protein annotation)
+{{"http:", "umbel.org", "umbel"}, 3}, // umbel (Upper Mapping and Binding 
Exchange Layer)
+{{"http:", "purl.org", "stuff", "rev"}, 4}, // rev (review)
+{{"http:", "purl.org", "linked-data", "cube"}, 4}, // qb (data cube)
+{{"http:", "www.w3.org", "ns", "org"}, 4}, // org (organizations)
+{{"http:", "purl.org", "vocab", "vann"}, 4}, // vann (vocabulary for 
annotating vocabulary descriptions)
+{{"http:", "data.ordnancesurvey.co.uk", "ontology", "admingeo"}, 4}, // 
admingeo (administrative geography and civil voting area)
+{{"http:", "www.w3.org", "2007", "05", "powder-s"}, 5}, // wdrs (Web 
Description Resources)
+{{"http:", "usefulinc.com", "ns", "doap"}, 4}, // doap (Description of a 
Project)
+{{"http:", "lod.taxonconcept.org", "ontology", "txn.owl"}, 4}, // txn 
(TaxonConcept, species)
+{{"http:", "xmlns.com", "wot", "0.1"}, 4}, // wot (Web Of Trust)
+{{"http:", "purl.org", "net", "compass"}, 4}, // compass
+{{"http:", "www.w3.org", "2004", "03", "trix", "rdfg-1"}, 6}, // rdfg (RDF 
graph)
+{{"http:", "purl.org", "NET", "c4dm", "timeline.owl"}, 5}, // tl (timeline)
+{{"http:", "purl.org", "dc", "dcam"}, 4}, // dcam (DublinCore metadata)
+{{"http:", "swrc.ontoware.org", "ontology"}, 3}, // swrc (university, research)
+{{"http:", "zeitkunst.org", "bibtex", "0.1", "bibtex.owl"}, 5}, // bib (bibTeX 
entries)
+{{"http:", "purl.org", "ontology", "po"}, 4} // po (tv and radio programmes)
+};
+
+#if USE_SHORT_NAMES
+/* Extracts the "human-readable" part of a URI (usually the last token). */
+static
+void getPropNameShort(char* name, char* propStr) {
+       char            *token;
+       char            uri[1000];
+       int             length = 0;             // number of tokens
+       char            **tokenizedUri = NULL;  // list of tokens
+       int             i, j;
+       int             fit;
+
+       // tokenize uri
+       strcpy(uri, propStr); // uri will be modified during tokenization
+       token = strtok(uri, "/#");
+       while (token != NULL) {
+               tokenizedUri = realloc(tokenizedUri, sizeof(char*) * ++length);
+               if (!tokenizedUri) fprintf(stderr, "ERROR: Couldn't realloc 
memory!\n");
+               tokenizedUri[length - 1] = token;
+               token = strtok(NULL, "/#");
+       }
+
+       // match with ontologies
+       for (j = 0; j < ontologyCount; ++j) {
+               if (length > ontologies[j].length) {
+                       fit = 1;
+                       for (i = 0; fit && i < ontologies[j].length; ++i) {
+                               if (strcmp(ontologies[j].uri[i], 
tokenizedUri[i]) != 0) {
+                                       fit = 0;
+                               }
+                       }
+                       if (fit) {
+                               // found matching ontology, create label
+                               for (i = ontologies[j].length; i < length; ++i) 
{
+                                       strcat(name, tokenizedUri[i]);
+                                       strcat(name, "_"); // if label consists 
of >=2 tokens, use underscores
+                               }
+                               // remove trailing underscore
+                               name[strlen(name) - 1] = '\0';
+
+                               free(tokenizedUri);
+                               return;
+                       }
+               }
+       }
+
+       // no matching ontology found, return content of last token
+
+       if (length == 1) {
+               // value
+               strcat(name, propStr);
+       } else {
+               strcat(name, tokenizedUri[length - 1]);
+       }
+
+       free(tokenizedUri);
+       return;
+}
+#endif
+
+static
+int** initTypeAttributesHistogramCount(int typeAttributesCount, int num) {
+       int             i, j;
+       int**           typeAttributesHistogramCount;
+
+       typeAttributesHistogramCount = (int **) malloc(sizeof(int *) * num);
+       if (!typeAttributesHistogramCount) fprintf(stderr, "ERROR: Couldn't 
malloc memory!\n");
+       for (i = 0; i < num; ++i) {
+               typeAttributesHistogramCount[i] = (int *) malloc(sizeof(int) * 
typeAttributesCount);
+               if (!typeAttributesHistogramCount[i]) fprintf(stderr, "ERROR: 
Couldn't malloc memory!\n");
+               for (j = 0; j < typeAttributesCount; ++j) {
+                       typeAttributesHistogramCount[i][j] = 0;
+               }
+       }
+
+       return typeAttributesHistogramCount;
+}
+
+static
+TypeAttributesFreq*** initTypeAttributesHistogram(int typeAttributesCount, int 
** typeAttributesHistogramCount, int num) {
+       int                     i, j, k;
+       TypeAttributesFreq***   typeAttributesHistogram;
+
+       typeAttributesHistogram = (TypeAttributesFreq ***) 
malloc(sizeof(TypeAttributesFreq **) * num);
+       if (!typeAttributesHistogram) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
+       for (i = 0; i < num; ++i) {
+               typeAttributesHistogram[i] = (TypeAttributesFreq **) malloc 
(sizeof(TypeAttributesFreq *) * typeAttributesCount);
+               if (!typeAttributesHistogram[i]) fprintf(stderr, "ERROR: 
Couldn't malloc memory!\n");
+               for (j = 0; j < typeAttributesCount; ++j) {
+                       typeAttributesHistogram[i][j] = (TypeAttributesFreq *) 
malloc (sizeof(TypeAttributesFreq) * typeAttributesHistogramCount[i][j]);
+                       if (!typeAttributesHistogram[i][j]) fprintf(stderr, 
"ERROR: Couldn't malloc memory!\n");
+                       for (k = 0; k < typeAttributesHistogramCount[i][j]; 
++k) {
+                               typeAttributesHistogram[i][j][k].freq = 0;
+                               typeAttributesHistogram[i][j][k].percent = 0;
+                       }
+               }
+       }
+
+       return typeAttributesHistogram;
+}
+
+static
+int** initRelationMetadataCount(CSset* freqCSset) {
+       int             i, j;
+       int**           relationMetadataCount;
+
+       relationMetadataCount = (int **) malloc(sizeof(int *) * 
freqCSset->numCSadded);
+       if (!relationMetadataCount) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
+       for (i = 0; i < freqCSset->numCSadded; ++i) {
+               relationMetadataCount[i] = NULL;
+               if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+               relationMetadataCount[i] = (int *) malloc(sizeof(int) * 
freqCSset->items[i].numProp);
+               if (!relationMetadataCount[i]) fprintf(stderr, "ERROR: Couldn't 
malloc memory!\n");
+               for (j = 0; j < freqCSset->items[i].numProp; ++j) {
+                       relationMetadataCount[i][j] = 0;
+               }
+       }
+
+       return relationMetadataCount;
+}
+
+/* Calculate frequency per foreign key relationship. */
+static
+Relation*** initRelationMetadata(int** relationMetadataCount, CSmergeRel* 
csRelBetweenMergeFreqSet, CSset* freqCSset) {
+       int             i, j, k;
+       Relation***     relationMetadata;
+
+       int             ret;
+       char*           schema = "rdf";
+
+       TKNZRopen (NULL, &schema);
+
+       relationMetadata = (Relation ***) malloc(sizeof(Relation **) * 
freqCSset->numCSadded);
+       if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc 
memory!\n");
+       for (i = 0; i < freqCSset->numCSadded; ++i) { // CS
+               CS cs = (CS) freqCSset->items[i];
+               if (cs.parentFreqIdx != -1) continue; // ignore
+               relationMetadata[i] = (Relation **) malloc (sizeof(Relation *) 
* cs.numProp);
+               if (!relationMetadata[i]) fprintf(stderr, "ERROR: Couldn't 
malloc memory!\n");
+               for (j = 0; j < cs.numProp; ++j) { // propNo in CS order
+                       int sum = 0;
+                       relationMetadataCount[i][j] = 0;
+                       relationMetadata[i][j] = NULL;
+                       for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef; 
++k) { // propNo in CSrel
+
+                               if (csRelBetweenMergeFreqSet[i].lstPropId[k] == 
cs.lstProp[j]) {
+                                       int toId = 
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
+                                       relationMetadataCount[i][j] += 1;
+
+                                       // alloc/realloc
+                                       if (relationMetadataCount[i][j] == 1) {
+                                               // alloc
+                                               relationMetadata[i][j] = 
(Relation *) malloc (sizeof(Relation));
+                                               if (!relationMetadata[i][j]) 
fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+                                               relationMetadata[i][j][0].to = 
toId;
+                                               relationMetadata[i][j][0].from 
= i;
+                                               relationMetadata[i][j][0].freq 
= csRelBetweenMergeFreqSet[i].lstCnt[k];
+                                               
relationMetadata[i][j][0].percent = -1;
+                                       } else {
+                                               // realloc
+                                               relationMetadata[i][j] = 
(Relation *) realloc(relationMetadata[i][j], sizeof(Relation) * 
relationMetadataCount[i][j]);
+                                               if (!relationMetadata[i][j]) 
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+                                               
relationMetadata[i][j][relationMetadataCount[i][j] - 1].to = toId;
+                                               
relationMetadata[i][j][relationMetadataCount[i][j] - 1].from = i;
+                                               
relationMetadata[i][j][relationMetadataCount[i][j] - 1].freq = 
csRelBetweenMergeFreqSet[i].lstCnt[k];
+                                               
relationMetadata[i][j][relationMetadataCount[i][j] - 1].percent = -1;
+                                       }
+                               }
+                       }
+
+                       // get total count of values
+                       for (k = 0; k < relationMetadataCount[i][j]; ++k) {
+                               sum += relationMetadata[i][j][k].freq;
+                       }
+                       // assign percentage values for every value
+                       for (k = 0; k < relationMetadataCount[i][j]; ++k) {
+                               relationMetadata[i][j][k].percent = (int) 
(100.0 * relationMetadata[i][j][k].freq / sum + 0.5);
+                       }
+               }
+       }
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list

MonetDB: rdf - Assign labels to CS's and CS properties.

Reply via email to