Changeset: 56f9183f2a24 for MonetDB
URL: http://dev.monetdb.org/hg/MonetDB?cmd=changeset;node=56f9183f2a24
Added Files:
monetdb5/extras/rdf/rdflabels.c
monetdb5/extras/rdf/rdflabels.h
Modified Files:
monetdb5/extras/rdf/Makefile.ag
monetdb5/extras/rdf/rdfontologyload.c
monetdb5/extras/rdf/rdfontologyload.h
monetdb5/extras/rdf/rdfschema.c
monetdb5/extras/rdf/rdfschema.mal
Branch: rdf
Log Message:
Assign labels to CS's and CS properties.
Compute candidates and choose the best name. Generate textual and graphical
representations of the resulting schema.
diffs (truncated from 1963 to 300 lines):
diff --git a/monetdb5/extras/rdf/Makefile.ag b/monetdb5/extras/rdf/Makefile.ag
--- a/monetdb5/extras/rdf/Makefile.ag
+++ b/monetdb5/extras/rdf/Makefile.ag
@@ -31,7 +31,7 @@ lib__rdf = {
#MODULE
NOINST
#DIR = libdir/monetdb5
- SOURCES = rdf.h rdfschema.h rdfparser.h rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c
+ SOURCES = rdf.h rdfschema.h rdflabels.h rdfparser.h rdfparser.c rdfontologyload.h rdfontologyload.c rdf_shredder.c rdfalgebra.c rdfschema.c rdflabels.c
#SEP = _
# LIBS = ./hashmap/librdfhash
diff --git a/monetdb5/extras/rdf/rdflabels.c b/monetdb5/extras/rdf/rdflabels.c
new file mode 100644
--- /dev/null
+++ b/monetdb5/extras/rdf/rdflabels.c
@@ -0,0 +1,1662 @@
+/*
+ * The contents of this file are subject to the MonetDB Public License
+ * Version 1.1 (the "License"); you may not use this file except in
+ * compliance with the License. You may obtain a copy of the License at
+ * http://www.monetdb.org/Legal/MonetDBLicense
+ *
+ * Software distributed under the License is distributed on an "AS IS"
+ * basis, WITHOUT WARRANTY OF ANY KIND, either express or implied. See the
+ * License for the specific language governing rights and limitations
+ * under the License.
+ *
+ * The Original Code is the MonetDB Database System.
+ *
+ * The Initial Developer of the Original Code is CWI.
+ * Portions created by CWI are Copyright (C) 1997-July 2008 CWI.
+ * Copyright August 2008-2012 MonetDB B.V.
+ * All Rights Reserved.
+ */
+
+#include "monetdb_config.h"
+#include "rdf.h"
+#include "rdflabels.h"
+#include "rdfschema.h"
+#include "tokenizer.h"
+#include <math.h>
+
+// List of known ontologies.
+// Every entry holds the leading tokens of an ontology namespace URI (the URI
+// split at '/' and '#') followed by the number of tokens in that prefix.
+// getPropNameShort() strips a matching prefix from a property URI to obtain a
+// short, human-readable name.
+ontology ontologies[] = {
+{{"http:", "www.facebook.com", "2008"}, 3},
+{{"http:", "facebook.com", "2008"}, 3},
+{{"http:", "developers.facebook.com", "schema"}, 3},
+{{"https:", "www.facebook.com", "2008"}, 3},
+{{"http:", "purl.org", "dc", "elements", "1.1"}, 5}, // dc DublinCore
+{{"http:", "purl.org", "dc", "terms"}, 4}, // DublinCore
+{{"http:", "purl.org", "goodrelations", "v1"}, 4}, // GoodRelations
+{{"http:", "purl.org", "rss", "1.0", "modules"}, 5},
+{{"http:", "purl.org", "stuff"}, 3},
+{{"http:", "www.purl.org", "stuff"}, 3},
+{{"http:", "ogp.me", "ns"}, 3},
+{{"https:", "ogp.me", "ns"}, 3},
+{{"http:", "www.w3.org", "1999", "02", "22-rdf-syntax-ns"}, 5}, // rdf
+{{"http:", "www.w3.org", "2000", "01", "rdf-schema"}, 5}, // rdfs
+{{"http:", "www.w3.org", "2004", "02", "skos", "core"}, 6}, // skos (Simple Knowledge Organization System)
+{{"http:", "www.w3.org", "2002", "07", "owl"}, 5},
+{{"http:", "www.w3.org", "2006", "vcard", "ns"}, 5}, // vcard
+{{"http:", "www.w3.org", "2001", "vcard-rdf", "3.0"}, 5},
+{{"http:", "www.w3.org", "2003", "01", "geo", "wgs84_pos"}, 6}, // geo
+{{"http:", "www.w3.org", "1999", "xhtml", "vocab"}, 5}, // xhtml
+{{"http:", "search.yahoo.com", "searchmonkey"}, 3},
+{{"https:", "search.yahoo.com", "searchmonkey"}, 3},
+{{"http:", "search.yahoo.co.jp", "searchmonkey"}, 3},
+{{"http:", "g.yahoo.com", "searchmonkey"}, 3},
+{{"http:", "opengraphprotocol.org", "schema"}, 3},
+{{"https:", "opengraphprotocol.org", "schema"}, 3},
+{{"http:", "opengraph.org", "schema"}, 3},
+{{"https:", "opengraph.org", "schema"}, 3},
+{{"http:", "creativecommons.org", "ns"}, 3}, // cc
+{{"http:", "rdf.data-vocabulary.org"}, 2}, // by google
+{{"http:", "rdfs.org", "sioc", "ns"}, 4}, // sioc (pronounced "shock", Semantically-Interlinked Online Communities Project)
+{{"http:", "xmlns.com", "foaf", "0.1"}, 4}, // foaf (Friend of a Friend)
+{{"http:", "mixi-platform.com", "ns"}, 3}, // japanese social graph
+{{"http:", "commontag.org", "ns"}, 3},
+{{"http:", "semsl.org", "ontology"}, 3}, // semantic web for second life
+{{"http:", "schema.org"}, 2},
+{{"http:", "openelectiondata.org", "0.1"}, 3},
+{{"http:", "search.aol.com", "rdf"}, 3},
+{{"http:", "www.loc.gov", "loc.terms", "relators"}, 4}, // library of congress
+{{"http:", "dbpedia.org", "ontology"}, 3}, // dbo
+{{"http:", "dbpedia.org", "resource"}, 3}, // dbpedia
+{{"http:", "dbpedia.org", "property"}, 3}, // dbp
+{{"http:", "www.aktors.org", "ontology", "portal"}, 4}, // akt (research, publications, ...)
+{{"http:", "purl.org", "ontology", "bibo"}, 4}, // bibo (bibliography)
+{{"http:", "purl.org", "ontology", "mo"}, 4}, // mo (music)
+{{"http:", "www.geonames.org", "ontology"}, 3}, // geonames
+{{"http:", "purl.org", "vocab", "frbr", "core"}, 5}, // frbr (Functional Requirements for Bibliographic Records)
+{{"http:", "www.w3.org", "2001", "XMLSchema"}, 4}, // xsd
+{{"http:", "www.w3.org", "2006", "time"}, 4}, // time
+{{"http:", "purl.org", "NET", "c4dm", "event.owl"}, 5}, // event
+{{"http:", "www.openarchives.org", "ore", "terms"}, 4}, // ore (Open Archive)
+{{"http:", "purl.org", "vocab", "bio", "0.1"}, 5}, // bio (biographical data)
+{{"http:", "www.holygoat.co.uk", "owl", "redwood", "0.1", "tags"}, 6}, // tag
+{{"http:", "rdfs.org", "ns", "void"}, 4}, // void (Vocabulary of Interlinked Datasets)
+{{"http:", "www.w3.org", "2006", "http"}, 4}, // http
+{{"http:", "purl.uniprot.org", "core"}, 3}, // uniprot (protein annotation)
+{{"http:", "umbel.org", "umbel"}, 3}, // umbel (Upper Mapping and Binding Exchange Layer)
+{{"http:", "purl.org", "stuff", "rev"}, 4}, // rev (review)
+{{"http:", "purl.org", "linked-data", "cube"}, 4}, // qb (data cube)
+{{"http:", "www.w3.org", "ns", "org"}, 4}, // org (organizations)
+{{"http:", "purl.org", "vocab", "vann"}, 4}, // vann (vocabulary for annotating vocabulary descriptions)
+{{"http:", "data.ordnancesurvey.co.uk", "ontology", "admingeo"}, 4}, // admingeo (administrative geography and civil voting area)
+{{"http:", "www.w3.org", "2007", "05", "powder-s"}, 5}, // wdrs (Web Description Resources)
+{{"http:", "usefulinc.com", "ns", "doap"}, 4}, // doap (Description of a Project)
+{{"http:", "lod.taxonconcept.org", "ontology", "txn.owl"}, 4}, // txn (TaxonConcept, species)
+{{"http:", "xmlns.com", "wot", "0.1"}, 4}, // wot (Web Of Trust)
+{{"http:", "purl.org", "net", "compass"}, 4}, // compass
+{{"http:", "www.w3.org", "2004", "03", "trix", "rdfg-1"}, 6}, // rdfg (RDF graph)
+{{"http:", "purl.org", "NET", "c4dm", "timeline.owl"}, 5}, // tl (timeline)
+{{"http:", "purl.org", "dc", "dcam"}, 4}, // dcam (DublinCore metadata)
+{{"http:", "swrc.ontoware.org", "ontology"}, 3}, // swrc (university, research)
+{{"http:", "zeitkunst.org", "bibtex", "0.1", "bibtex.owl"}, 5}, // bib (bibTeX entries)
+{{"http:", "purl.org", "ontology", "po"}, 4} // po (tv and radio programmes)
+};
+
+// Number of entries in ontologies[]; derived from the array so that adding or
+// removing an entry can never leave the count stale.
+int ontologyCount = sizeof(ontologies) / sizeof(ontologies[0]);
+
+#if USE_SHORT_NAMES
+/*
+ * Extracts the "human-readable" part of an URI (usually the last token) and
+ * appends it to name.
+ *
+ * The URI is split at '/' and '#'. If its leading tokens match one of the
+ * known ontologies, all remaining tokens are appended, joined by underscores.
+ * Otherwise the last token is appended. If the input consists of at most one
+ * token (e.g. a plain value), propStr is appended unchanged.
+ *
+ * name:    NUL-terminated output buffer; the result is appended with strcat(),
+ *          so the caller has to initialize it and provide enough room.
+ * propStr: NUL-terminated URI or value; it is not modified.
+ *
+ * If memory cannot be allocated, an error is printed to stderr and propStr is
+ * appended unchanged.
+ */
+static
+void getPropNameShort(char* name, char* propStr) {
+	char *uri;			// private copy of propStr, cut into tokens in place
+	char *cursor;			// current position while tokenizing
+	char **tokenizedUri = NULL;	// list of tokens (pointers into uri)
+	char **grown;
+	int length = 0;			// number of tokens
+	int i, j;
+	int fit;
+
+	// copy the input, sized to fit, because tokenization overwrites delimiters
+	uri = (char *) malloc(strlen(propStr) + 1);
+	if (!uri) {
+		fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+		strcat(name, propStr);
+		return;
+	}
+	strcpy(uri, propStr);
+
+	// tokenize uri: tokens are maximal runs of characters other than '/' and '#'
+	cursor = uri + strspn(uri, "/#");
+	while (*cursor != '\0') {
+		size_t tokenLength = strcspn(cursor, "/#");
+
+		// keep the old array reachable until the new one is known to exist
+		grown = (char **) realloc(tokenizedUri, sizeof(char*) * (length + 1));
+		if (!grown) {
+			fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+			free(tokenizedUri);
+			free(uri);
+			strcat(name, propStr);
+			return;
+		}
+		tokenizedUri = grown;
+		tokenizedUri[length++] = cursor;
+
+		cursor += tokenLength;
+		if (*cursor != '\0') {
+			*cursor++ = '\0';		// terminate this token
+			cursor += strspn(cursor, "/#");	// skip further delimiters
+		}
+	}
+
+	// match with ontologies
+	for (j = 0; j < ontologyCount; ++j) {
+		// at least one token has to remain after the ontology prefix
+		if (length <= ontologies[j].length) continue;
+
+		fit = 1;
+		for (i = 0; fit && i < ontologies[j].length; ++i) {
+			if (strcmp(ontologies[j].uri[i], tokenizedUri[i]) != 0) {
+				fit = 0;
+			}
+		}
+		if (!fit) continue;
+
+		// found matching ontology, create label
+		for (i = ontologies[j].length; i < length; ++i) {
+			strcat(name, tokenizedUri[i]);
+			strcat(name, "_"); // if label consists of >=2 tokens, use underscores
+		}
+		// remove trailing underscore
+		name[strlen(name) - 1] = '\0';
+
+		free(tokenizedUri);
+		free(uri);
+		return;
+	}
+
+	// no matching ontology found, return content of last token
+
+	if (length <= 1) {
+		// value, or an input without any token at all
+		strcat(name, propStr);
+	} else {
+		strcat(name, tokenizedUri[length - 1]);
+	}
+
+	free(tokenizedUri);
+	free(uri);
+	return;
+}
+#endif
+
+/*
+ * Allocates a table of num rows with typeAttributesCount counters each and
+ * sets every counter to 0.
+ *
+ * Returns the table, or NULL if memory could not be allocated. In that case an
+ * error is printed to stderr and everything allocated so far is released.
+ * The caller owns the result: every row and the table itself have to be
+ * released with free().
+ */
+static
+int** initTypeAttributesHistogramCount(int typeAttributesCount, int num) {
+	int i, j;
+	int** typeAttributesHistogramCount;
+
+	typeAttributesHistogramCount = (int **) malloc(sizeof(int *) * num);
+	if (!typeAttributesHistogramCount) {
+		// malloc(0) may legitimately return NULL, only report real failures
+		if (num > 0) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+		return NULL;
+	}
+	for (i = 0; i < num; ++i) {
+		typeAttributesHistogramCount[i] = (int *) malloc(sizeof(int) * typeAttributesCount);
+		if (!typeAttributesHistogramCount[i] && typeAttributesCount > 0) {
+			fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+			// release the rows allocated before the failing one
+			while (--i >= 0) {
+				free(typeAttributesHistogramCount[i]);
+			}
+			free(typeAttributesHistogramCount);
+			return NULL;
+		}
+		for (j = 0; j < typeAttributesCount; ++j) {
+			typeAttributesHistogramCount[i][j] = 0;
+		}
+	}
+
+	return typeAttributesHistogramCount;
+}
+
+/*
+ * Allocates the three-level histogram: num rows, typeAttributesCount buckets
+ * per row, and typeAttributesHistogramCount[i][j] entries in bucket [i][j].
+ * The freq and percent fields of every entry are set to 0.
+ *
+ * Returns the histogram, or NULL if memory could not be allocated. In that
+ * case an error is printed to stderr and everything allocated so far is
+ * released. The caller owns the result and has to free() every bucket, every
+ * row and the histogram itself.
+ */
+static
+TypeAttributesFreq*** initTypeAttributesHistogram(int typeAttributesCount, int ** typeAttributesHistogramCount, int num) {
+	int i, j, k;
+	TypeAttributesFreq*** typeAttributesHistogram;
+
+	typeAttributesHistogram = (TypeAttributesFreq ***) malloc(sizeof(TypeAttributesFreq **) * num);
+	if (!typeAttributesHistogram) {
+		// malloc(0) may legitimately return NULL, only report real failures
+		if (num > 0) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+		return NULL;
+	}
+	// mark all rows as "not allocated" so that cleanup can run at any point
+	for (i = 0; i < num; ++i) {
+		typeAttributesHistogram[i] = NULL;
+	}
+
+	for (i = 0; i < num; ++i) {
+		TypeAttributesFreq **row = (TypeAttributesFreq **) malloc (sizeof(TypeAttributesFreq *) * typeAttributesCount);
+		if (!row && typeAttributesCount > 0) goto fail;
+		typeAttributesHistogram[i] = row;
+		for (j = 0; j < typeAttributesCount; ++j) {
+			row[j] = NULL;
+		}
+
+		for (j = 0; j < typeAttributesCount; ++j) {
+			int count = typeAttributesHistogramCount[i][j];
+			TypeAttributesFreq *bucket = (TypeAttributesFreq *) malloc (sizeof(TypeAttributesFreq) * count);
+			// an empty bucket (count 0) is not an allocation failure
+			if (!bucket && count > 0) goto fail;
+			row[j] = bucket;
+			for (k = 0; k < count; ++k) {
+				bucket[k].freq = 0;
+				bucket[k].percent = 0;
+			}
+		}
+	}
+
+	return typeAttributesHistogram;
+
+fail:
+	fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+	for (i = 0; i < num; ++i) {
+		if (!typeAttributesHistogram[i]) continue;
+		for (j = 0; j < typeAttributesCount; ++j) {
+			free(typeAttributesHistogram[i][j]);
+		}
+		free(typeAttributesHistogram[i]);
+	}
+	free(typeAttributesHistogram);
+	return NULL;
+}
+
+/*
+ * Allocates one counter row per CS in freqCSset, with one counter per property
+ * of that CS, all set to 0. A CS whose parentFreqIdx is not -1 is ignored and
+ * gets a NULL row.
+ *
+ * Returns the table, or NULL if memory could not be allocated. In that case an
+ * error is printed to stderr and everything allocated so far is released.
+ * The caller owns the result: every row and the table itself have to be
+ * released with free().
+ */
+static
+int** initRelationMetadataCount(CSset* freqCSset) {
+	int i, j;
+	int** relationMetadataCount;
+
+	relationMetadataCount = (int **) malloc(sizeof(int *) * freqCSset->numCSadded);
+	if (!relationMetadataCount) {
+		// malloc(0) may legitimately return NULL, only report real failures
+		if (freqCSset->numCSadded > 0) fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+		return NULL;
+	}
+	for (i = 0; i < freqCSset->numCSadded; ++i) {
+		relationMetadataCount[i] = NULL;
+		if (freqCSset->items[i].parentFreqIdx != -1) continue; // ignore
+		relationMetadataCount[i] = (int *) malloc(sizeof(int) * freqCSset->items[i].numProp);
+		if (!relationMetadataCount[i] && freqCSset->items[i].numProp > 0) {
+			fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+			// release earlier rows; ignored CS's have NULL rows, which free() accepts
+			while (--i >= 0) {
+				free(relationMetadataCount[i]);
+			}
+			free(relationMetadataCount);
+			return NULL;
+		}
+		for (j = 0; j < freqCSset->items[i].numProp; ++j) {
+			relationMetadataCount[i][j] = 0;
+		}
+	}
+
+	return relationMetadataCount;
+}
+
+/* Calculate frequency per foreign key relationship. */
+static
+Relation*** initRelationMetadata(int** relationMetadataCount, CSmergeRel*
csRelBetweenMergeFreqSet, CSset* freqCSset) {
+ int i, j, k;
+ Relation*** relationMetadata;
+
+ int ret;
+ char* schema = "rdf";
+
+ TKNZRopen (NULL, &schema);
+
+ relationMetadata = (Relation ***) malloc(sizeof(Relation **) *
freqCSset->numCSadded);
+ if (!relationMetadata) fprintf(stderr, "ERROR: Couldn't malloc
memory!\n");
+ for (i = 0; i < freqCSset->numCSadded; ++i) { // CS
+ CS cs = (CS) freqCSset->items[i];
+ if (cs.parentFreqIdx != -1) continue; // ignore
+ relationMetadata[i] = (Relation **) malloc (sizeof(Relation *)
* cs.numProp);
+ if (!relationMetadata[i]) fprintf(stderr, "ERROR: Couldn't
malloc memory!\n");
+ for (j = 0; j < cs.numProp; ++j) { // propNo in CS order
+ int sum = 0;
+ relationMetadataCount[i][j] = 0;
+ relationMetadata[i][j] = NULL;
+ for (k = 0; k < csRelBetweenMergeFreqSet[i].numRef;
++k) { // propNo in CSrel
+
+ if (csRelBetweenMergeFreqSet[i].lstPropId[k] ==
cs.lstProp[j]) {
+ int toId =
csRelBetweenMergeFreqSet[i].lstRefFreqIdx[k];
+ relationMetadataCount[i][j] += 1;
+
+ // alloc/realloc
+ if (relationMetadataCount[i][j] == 1) {
+ // alloc
+ relationMetadata[i][j] =
(Relation *) malloc (sizeof(Relation));
+ if (!relationMetadata[i][j])
fprintf(stderr, "ERROR: Couldn't malloc memory!\n");
+ relationMetadata[i][j][0].to =
toId;
+ relationMetadata[i][j][0].from
= i;
+ relationMetadata[i][j][0].freq
= csRelBetweenMergeFreqSet[i].lstCnt[k];
+
relationMetadata[i][j][0].percent = -1;
+ } else {
+ // realloc
+ relationMetadata[i][j] =
(Relation *) realloc(relationMetadata[i][j], sizeof(Relation) *
relationMetadataCount[i][j]);
+ if (!relationMetadata[i][j])
fprintf(stderr, "ERROR: Couldn't realloc memory!\n");
+
relationMetadata[i][j][relationMetadataCount[i][j] - 1].to = toId;
+
relationMetadata[i][j][relationMetadataCount[i][j] - 1].from = i;
+
relationMetadata[i][j][relationMetadataCount[i][j] - 1].freq =
csRelBetweenMergeFreqSet[i].lstCnt[k];
+
relationMetadata[i][j][relationMetadataCount[i][j] - 1].percent = -1;
+ }
+ }
+ }
+
+ // get total count of values
+ for (k = 0; k < relationMetadataCount[i][j]; ++k) {
+ sum += relationMetadata[i][j][k].freq;
+ }
+ // assign percentage values for every value
+ for (k = 0; k < relationMetadataCount[i][j]; ++k) {
+ relationMetadata[i][j][k].percent = (int)
(100.0 * relationMetadata[i][j][k].freq / sum + 0.5);
+ }
+ }
+ }
_______________________________________________
checkin-list mailing list
[email protected]
http://mail.monetdb.org/mailman/listinfo/checkin-list