[MARMOTTA-621] implement a loader backend for Ostrich; an experiment with DBPedia gives around 45k triples/sec
Project: http://git-wip-us.apache.org/repos/asf/marmotta/repo Commit: http://git-wip-us.apache.org/repos/asf/marmotta/commit/8f60cf64 Tree: http://git-wip-us.apache.org/repos/asf/marmotta/tree/8f60cf64 Diff: http://git-wip-us.apache.org/repos/asf/marmotta/diff/8f60cf64 Branch: refs/heads/MARMOTTA-584 Commit: 8f60cf64dc8b8e99d89dbf4e54fa81af3635606f Parents: 4eb074d Author: Sebastian Schaffert <[email protected]> Authored: Sun Dec 20 12:40:52 2015 +0100 Committer: Sebastian Schaffert <[email protected]> Committed: Sun Dec 20 12:40:52 2015 +0100 ---------------------------------------------------------------------- libraries/ostrich/backend/CMakeLists.txt | 2 +- .../ostrich/sail/OstrichSailConnection.java | 14 +- loader/marmotta-loader-ostrich/pom.xml | 83 ++++++++++ .../loader/ostrich/OstrichLoaderBackend.java | 100 ++++++++++++ .../loader/ostrich/OstrichLoaderHandler.java | 156 +++++++++++++++++++ ...org.apache.marmotta.loader.api.LoaderBackend | 18 +++ .../src/main/resources/logback.xml | 32 ++++ loader/pom.xml | 10 ++ 8 files changed, 407 insertions(+), 8 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/libraries/ostrich/backend/CMakeLists.txt ---------------------------------------------------------------------- diff --git a/libraries/ostrich/backend/CMakeLists.txt b/libraries/ostrich/backend/CMakeLists.txt index 5a8f110..61156a5 100644 --- a/libraries/ostrich/backend/CMakeLists.txt +++ b/libraries/ostrich/backend/CMakeLists.txt @@ -17,7 +17,7 @@ find_package (GLog REQUIRED) find_package (Boost 1.54.0 COMPONENTS iostreams filesystem system) find_package (Tcmalloc) -#add_definitions(-DNDEBUG) +add_definitions(-DNDEBUG) if (Boost_IOSTREAMS_FOUND) message(STATUS "Enabling gzip/bzip2 support (Boost iostreams found)") 
http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java ---------------------------------------------------------------------- diff --git a/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java b/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java index 93e21ac..e85fdd2 100644 --- a/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java +++ b/libraries/ostrich/client/src/main/java/org/apache/marmotta/ostrich/sail/OstrichSailConnection.java @@ -107,7 +107,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase { @Override protected void addStatementInternal(Resource subj, URI pred, Value obj, Resource... contexts) throws SailException { - log.info("Adding statements."); + log.debug("Adding statements."); ensureTransaction(); if (contexts.length > 0) { @@ -319,7 +319,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase { @Override protected void removeStatementsInternal(Resource subj, URI pred, Value obj, Resource... contexts) throws SailException { - log.info("Removing statements."); + log.debug("Removing statements."); commitForQuery(); ensureTransaction(); @@ -338,7 +338,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase { @Override protected void clearInternal(Resource... contexts) throws SailException { - log.info("Clearing statements."); + log.debug("Clearing statements."); commitForQuery(); ensureTransaction(); @@ -357,7 +357,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase { @Override protected CloseableIteration<? 
extends Namespace, SailException> getNamespacesInternal() throws SailException { - log.info("Getting namespaces."); + log.debug("Getting namespaces."); commitForQuery(); Empty pattern = Empty.getDefaultInstance(); @@ -382,7 +382,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase { @Override protected void setNamespaceInternal(String prefix, String name) throws SailException { - log.info("Setting namespace {} = {}.", prefix, name); + log.debug("Setting namespace {} = {}.", prefix, name); ensureTransaction(); ProtoNamespace ns = new ProtoNamespace(prefix, name); @@ -393,7 +393,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase { @Override protected void removeNamespaceInternal(String prefix) throws SailException { - log.info("Removing namespace {}.", prefix); + log.debug("Removing namespace {}.", prefix); commitForQuery(); ensureTransaction(); @@ -404,7 +404,7 @@ public class OstrichSailConnection extends NotifyingSailConnectionBase { @Override protected void clearNamespacesInternal() throws SailException { - log.info("Clearing namespaces."); + log.debug("Clearing namespaces."); commitForQuery(); ensureTransaction(); http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/pom.xml ---------------------------------------------------------------------- diff --git a/loader/marmotta-loader-ostrich/pom.xml b/loader/marmotta-loader-ostrich/pom.xml new file mode 100644 index 0000000..5816b68 --- /dev/null +++ b/loader/marmotta-loader-ostrich/pom.xml @@ -0,0 +1,83 @@ +<?xml version="1.0" encoding="UTF-8"?> +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. 
You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + + <parent> + <groupId>org.apache.marmotta</groupId> + <artifactId>marmotta-parent</artifactId> + <version>3.4.0-SNAPSHOT</version> + <relativePath>../../parent</relativePath> + </parent> + + <artifactId>marmotta-loader-ostrich</artifactId> + <name>Loader: Ostrich Backend</name> + + <description> + Apache Marmotta bulk loader backend for loading large datasets into a Ostrich/LevelDB backend. 
+ </description> + + <build> + <plugins> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-shade-plugin</artifactId> + <version>2.2</version> + <executions> + <execution> + <phase>package</phase> + <goals> + <goal>shade</goal> + </goals> + <configuration> + <createDependencyReducedPom>false</createDependencyReducedPom> + <transformers> + <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheLicenseResourceTransformer" /> + <transformer implementation="org.apache.maven.plugins.shade.resource.ManifestResourceTransformer"> + <mainClass>org.apache.marmotta.loader.core.MarmottaLoader</mainClass> + </transformer> + <transformer implementation="org.apache.maven.plugins.shade.resource.ServicesResourceTransformer" /> + <transformer implementation="org.apache.maven.plugins.shade.resource.ApacheNoticeResourceTransformer"> + <addHeader>false</addHeader> + </transformer> + </transformers> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> + + <dependencies> + <dependency> + <groupId>org.apache.marmotta</groupId> + <artifactId>marmotta-loader-core</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.marmotta</groupId> + <artifactId>ostrich-model</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.marmotta</groupId> + <artifactId>ostrich-client</artifactId> + <version>${project.version}</version> + </dependency> + </dependencies> +</project> http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderBackend.java ---------------------------------------------------------------------- diff --git a/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderBackend.java 
b/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderBackend.java new file mode 100644 index 0000000..bd6e49e --- /dev/null +++ b/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderBackend.java @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.marmotta.loader.ostrich; + +import org.apache.commons.cli.Option; +import org.apache.commons.cli.OptionBuilder; +import org.apache.commons.configuration.Configuration; +import org.apache.marmotta.loader.api.LoaderBackend; +import org.apache.marmotta.loader.api.LoaderHandler; + +import java.util.Collection; +import java.util.HashSet; +import java.util.Set; + +/** + * Ostrich loader backend. Provides configuration for the OstrichLoaderHandler. + * + * @author Sebastian Schaffert ([email protected]) + */ +public class OstrichLoaderBackend implements LoaderBackend { + + /** + * Create the RDFHandler to be used for bulk-loading, optionally using the configuration passed as argument. 
+ * + * @param configuration + * @return a newly created RDFHandler instance + */ + @Override + public LoaderHandler createLoader(Configuration configuration) { + return new OstrichLoaderHandler( + configuration.getString("backend.ostrich.host", "localhost"), + configuration.getInt("backend.ostrich.port", 10000), + configuration.getLong("backend.ostrich.batchsize", 500000)); + } + + /** + * Return a unique identifier for the loader; used for identifying the loader to choose on the command line + * in case more than one loader implementation is available. + * <p/> + * Should match with the regular expression [a-z][a-z0-9]* + * + * @return + */ + @Override + public String getIdentifier() { + return "ostrich"; + } + + /** + * Return any additional options that this backend offers (e.g. for connecting to a database etc). + * If there are no additional options, return an empty collection. + * + * @return + */ + @Override + public Collection<Option> getOptions() { + Set<Option> options = new HashSet<>(); + + Option host = + OptionBuilder.withArgName("host") + .hasArgs(1) + .withDescription("hostname or IP address of Ostrich/LevelDB server") + .withLongOpt("host") + .create('H'); + options.add(host); + + Option port = + OptionBuilder.withArgName("port") + .hasArgs(1) + .withDescription("port used by Ostrich/LevelDB server") + .withLongOpt("port") + .create('P'); + options.add(port); + + Option batchSize = + OptionBuilder.withArgName("batchsize") + .hasArgs(1) + .withDescription("maximum number of statements to commit in one batch") + .withLongOpt("batchsize") + .create('B'); + options.add(batchSize); + + return options; + } +} http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderHandler.java ---------------------------------------------------------------------- diff --git 
a/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderHandler.java b/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderHandler.java new file mode 100644 index 0000000..675d25a --- /dev/null +++ b/loader/marmotta-loader-ostrich/src/main/java/org/apache/marmotta/loader/ostrich/OstrichLoaderHandler.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.marmotta.loader.ostrich; + +import org.apache.marmotta.loader.api.LoaderHandler; +import org.apache.marmotta.ostrich.sail.OstrichSail; +import org.openrdf.model.Statement; +import org.openrdf.rio.RDFHandlerException; +import org.openrdf.sail.SailConnection; +import org.openrdf.sail.SailException; + +/** + * Add file description here! 
+ * + * @author Sebastian Schaffert ([email protected]) + */ +public class OstrichLoaderHandler implements LoaderHandler { + + private OstrichSail sail; + private SailConnection con; + + long count = 0; + long batchSize = 500000; + + public OstrichLoaderHandler(String host, int port, long batchSize) { + this.batchSize = batchSize; + this.sail = new OstrichSail(host,port); + } + + /** + * Initialise the handler, performing any initialisation steps that are necessary before bulk importing can + * start (e.g. dropping indexes or establishing a connection). + * + * @throws RDFHandlerException + */ + @Override + public void initialise() throws RDFHandlerException { + try { + sail.initialize(); + con = sail.getConnection(); + } catch (SailException e) { + throw new RDFHandlerException("Could not establish Ostrich connection", e); + } + } + + /** + * Peform cleanup on shutdown, e.g. re-creating indexes after import completed or freeing resources acquired by + * the handler. + */ + @Override + public void shutdown() throws RDFHandlerException { + try { + con.close(); + sail.shutDown(); + } catch (SailException e) { + throw new RDFHandlerException("Could not close Ostrich connection", e); + } + + } + + /** + * Signals the start of the RDF data. This method is called before any data + * is reported. + * + * @throws RDFHandlerException If the RDF handler has encountered an unrecoverable error. + */ + @Override + public void startRDF() throws RDFHandlerException { + try { + con.begin(); + } catch (SailException e) { + throw new RDFHandlerException("Could not start transaction", e); + } + } + + /** + * Signals the end of the RDF data. This method is called when all data has + * been reported. + * + * @throws RDFHandlerException If the RDF handler has encountered an unrecoverable error. 
+ */ + @Override + public void endRDF() throws RDFHandlerException { + try { + con.commit(); + } catch (SailException e) { + throw new RDFHandlerException("Could not commit transaction", e); + } + } + + /** + * Handles a namespace declaration/definition. A namespace declaration + * associates a (short) prefix string with the namespace's URI. The prefix + * for default namespaces, which do not have an associated prefix, are + * represented as empty strings. + * + * @param prefix The prefix for the namespace, or an empty string in case of a + * default namespace. + * @param uri The URI that the prefix maps to. + * @throws RDFHandlerException If the RDF handler has encountered an unrecoverable error. + */ + @Override + public void handleNamespace(String prefix, String uri) throws RDFHandlerException { + try { + con.setNamespace(prefix, uri); + } catch (SailException e) { + throw new RDFHandlerException("Could not add namespace", e); + } + } + + /** + * Handles a statement. + * + * @param st The statement. + * @throws RDFHandlerException If the RDF handler has encountered an unrecoverable error. + */ + @Override + public void handleStatement(Statement st) throws RDFHandlerException { + try { + con.addStatement(st.getSubject(), st.getPredicate(), st.getObject(), st.getContext()); + + if (++count % batchSize == 0) { + con.commit(); + con.begin(); + } + } catch (SailException e) { + throw new RDFHandlerException("Could not add statement", e); + } + } + + /** + * Handles a comment. + * + * @param comment The comment. + * @throws RDFHandlerException If the RDF handler has encountered an unrecoverable error. 
+ */ + @Override + public void handleComment(String comment) throws RDFHandlerException { + + } +} http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/src/main/resources/META-INF/services/org.apache.marmotta.loader.api.LoaderBackend ---------------------------------------------------------------------- diff --git a/loader/marmotta-loader-ostrich/src/main/resources/META-INF/services/org.apache.marmotta.loader.api.LoaderBackend b/loader/marmotta-loader-ostrich/src/main/resources/META-INF/services/org.apache.marmotta.loader.api.LoaderBackend new file mode 100644 index 0000000..ca1e9a1 --- /dev/null +++ b/loader/marmotta-loader-ostrich/src/main/resources/META-INF/services/org.apache.marmotta.loader.api.LoaderBackend @@ -0,0 +1,18 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +org.apache.marmotta.loader.ostrich.OstrichLoaderBackend \ No newline at end of file http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/marmotta-loader-ostrich/src/main/resources/logback.xml ---------------------------------------------------------------------- diff --git a/loader/marmotta-loader-ostrich/src/main/resources/logback.xml b/loader/marmotta-loader-ostrich/src/main/resources/logback.xml new file mode 100644 index 0000000..ebc7937 --- /dev/null +++ b/loader/marmotta-loader-ostrich/src/main/resources/logback.xml @@ -0,0 +1,32 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one or more + ~ contributor license agreements. See the NOTICE file distributed with + ~ this work for additional information regarding copyright ownership. + ~ The ASF licenses this file to You under the Apache License, Version 2.0 + ~ (the "License"); you may not use this file except in compliance with + ~ the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<configuration> + <appender name="CONSOLE" class="ch.qos.logback.core.ConsoleAppender"> + <encoder> + <pattern>%d{HH:mm:ss.SSS} %level %logger{15} - %m%n</pattern> + </encoder> + </appender> + + <logger name="org.apache.zookeeper" level="WARN" /> + <logger name="org.apache.hadoop" level="WARN" /> + + + <root level="${root-level:-INFO}"> + <appender-ref ref="CONSOLE"/> + </root> +</configuration> http://git-wip-us.apache.org/repos/asf/marmotta/blob/8f60cf64/loader/pom.xml ---------------------------------------------------------------------- diff --git a/loader/pom.xml b/loader/pom.xml index dbdc0f1..4771f28 100644 --- a/loader/pom.xml +++ b/loader/pom.xml @@ -92,4 +92,14 @@ <module>marmotta-loader-berkeley</module> </modules> + <profiles> + <profile> + <id>ostrich</id> + <modules> + <module>marmotta-loader-ostrich</module> + </modules> + </profile> + </profiles> + + </project>
