http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/QuadsToTriplesMapperTest.java ---------------------------------------------------------------------- diff --git a/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/QuadsToTriplesMapperTest.java b/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/QuadsToTriplesMapperTest.java new file mode 100644 index 0000000..51b29cb --- /dev/null +++ b/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/QuadsToTriplesMapperTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.hadoop.rdf.mapreduce.transform; + +import java.io.IOException; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mrunit.mapreduce.MapDriver; +import org.apache.hadoop.mrunit.types.Pair; +import org.apache.jena.hadoop.rdf.mapreduce.AbstractMapperTests; +import org.apache.jena.hadoop.rdf.mapreduce.transform.QuadsToTriplesMapper; +import org.apache.jena.hadoop.rdf.types.QuadWritable; +import org.apache.jena.hadoop.rdf.types.TripleWritable; +import org.junit.Test; + +import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; +import com.hp.hpl.jena.graph.NodeFactory; +import com.hp.hpl.jena.graph.Triple; +import com.hp.hpl.jena.sparql.core.Quad; + +/** + * Tests for the {@link QuadsToTriplesMapper} + * + * + * + */ +public class QuadsToTriplesMapperTest extends AbstractMapperTests<LongWritable, QuadWritable, LongWritable, TripleWritable> { + + @Override + protected Mapper<LongWritable, QuadWritable, LongWritable, TripleWritable> getInstance() { + return new QuadsToTriplesMapper<LongWritable>(); + } + + protected void generateData(MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver, int num) { + for (int i = 0; i < num; i++) { + Triple t = new Triple(NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"), + NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger)); + Quad q = new Quad(Quad.defaultGraphNodeGenerated, t); + driver.addInput(new LongWritable(i), new QuadWritable(q)); + driver.addOutput(new LongWritable(i), new TripleWritable(t)); + } + } + + /** + * Tests quads to triples conversion + * + * @throws IOException + */ + @Test + public void quads_to_triples_mapper_01() throws IOException { + MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver = this.getMapDriver(); + + Triple t = new Triple(NodeFactory.createURI("http://s"), NodeFactory.createURI("http://p"), + NodeFactory.createLiteral("test")); + Quad q = 
new Quad(Quad.defaultGraphNodeGenerated, t); + driver.withInput(new Pair<LongWritable, QuadWritable>(new LongWritable(1), new QuadWritable(q))).withOutput( + new Pair<LongWritable, TripleWritable>(new LongWritable(1), new TripleWritable(t))); + driver.runTest(); + } + + /** + * Tests quads to triples conversion + * + * @throws IOException + */ + @Test + public void quads_to_triples_mapper_02() throws IOException { + MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver = this.getMapDriver(); + this.generateData(driver, 100); + driver.runTest(); + } + + /** + * Tests quads to triples conversion + * + * @throws IOException + */ + @Test + public void quads_to_triples_mapper_03() throws IOException { + MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver = this.getMapDriver(); + this.generateData(driver, 1000); + driver.runTest(); + } + + /** + * Tests quads to triples conversion + * + * @throws IOException + */ + @Test + public void quads_to_triples_mapper_04() throws IOException { + MapDriver<LongWritable, QuadWritable, LongWritable, TripleWritable> driver = this.getMapDriver(); + this.generateData(driver, 10000); + driver.runTest(); + } +}
http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsBySubjectMapperTest.java ---------------------------------------------------------------------- diff --git a/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsBySubjectMapperTest.java b/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsBySubjectMapperTest.java new file mode 100644 index 0000000..bdf39f5 --- /dev/null +++ b/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsBySubjectMapperTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.hadoop.rdf.mapreduce.transform; + +import java.io.IOException; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mrunit.mapreduce.MapDriver; +import org.apache.hadoop.mrunit.types.Pair; +import org.apache.jena.hadoop.rdf.mapreduce.AbstractMapperTests; +import org.apache.jena.hadoop.rdf.mapreduce.transform.TriplesToQuadsBySubjectMapper; +import org.apache.jena.hadoop.rdf.types.QuadWritable; +import org.apache.jena.hadoop.rdf.types.TripleWritable; +import org.junit.Test; + +import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; +import com.hp.hpl.jena.graph.NodeFactory; +import com.hp.hpl.jena.graph.Triple; +import com.hp.hpl.jena.sparql.core.Quad; + +/** + * Tests for the {@link TriplesToQuadsBySubjectMapper} + * + * + * + */ +public class TriplesToQuadsBySubjectMapperTest extends AbstractMapperTests<LongWritable, TripleWritable, LongWritable, QuadWritable> { + + @Override + protected Mapper<LongWritable, TripleWritable, LongWritable, QuadWritable> getInstance() { + return new TriplesToQuadsBySubjectMapper<LongWritable>(); + } + + protected void generateData(MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver, int num) { + for (int i = 0; i < num; i++) { + Triple t = new Triple(NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"), + NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger)); + Quad q = new Quad(t.getSubject(), t); + driver.addInput(new LongWritable(i), new TripleWritable(t)); + driver.addOutput(new LongWritable(i), new QuadWritable(q)); + } + } + + /** + * Tests triples to quads conversion + * + * @throws IOException + */ + @Test + public void triples_to_quads_mapper_01() throws IOException { + MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver(); + + Triple t = new Triple(NodeFactory.createURI("http://s"), 
NodeFactory.createURI("http://p"), + NodeFactory.createLiteral("test")); + Quad q = new Quad(t.getSubject(), t); + driver.withInput(new Pair<LongWritable, TripleWritable>(new LongWritable(1), new TripleWritable(t))).withOutput( + new Pair<LongWritable, QuadWritable>(new LongWritable(1), new QuadWritable(q))); + driver.runTest(); + } + + /** + * Tests triples to quads conversion + * + * @throws IOException + */ + @Test + public void triples_to_quads_mapper_02() throws IOException { + MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver(); + this.generateData(driver, 100); + driver.runTest(); + } + + /** + * Tests triples to quads conversion + * + * @throws IOException + */ + @Test + public void triples_to_quads_mapper_03() throws IOException { + MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver(); + this.generateData(driver, 1000); + driver.runTest(); + } + + /** + * Tests triples to quads conversion + * + * @throws IOException + */ + @Test + public void triples_to_quads_mapper_04() throws IOException { + MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver(); + this.generateData(driver, 10000); + driver.runTest(); + } +} http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsConstantGraphMapperTest.java ---------------------------------------------------------------------- diff --git a/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsConstantGraphMapperTest.java b/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsConstantGraphMapperTest.java new file mode 100644 index 0000000..b82f74b --- /dev/null +++ b/hadoop-rdf/hadoop-rdf-mapreduce/src/test/java/org/apache/jena/hadoop/rdf/mapreduce/transform/TriplesToQuadsConstantGraphMapperTest.java @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.jena.hadoop.rdf.mapreduce.transform; + +import java.io.IOException; + +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.mapreduce.Mapper; +import org.apache.hadoop.mrunit.mapreduce.MapDriver; +import org.apache.hadoop.mrunit.types.Pair; +import org.apache.jena.hadoop.rdf.mapreduce.AbstractMapperTests; +import org.apache.jena.hadoop.rdf.mapreduce.transform.TriplesToQuadsConstantGraphMapper; +import org.apache.jena.hadoop.rdf.types.QuadWritable; +import org.apache.jena.hadoop.rdf.types.TripleWritable; +import org.junit.Test; + +import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; +import com.hp.hpl.jena.graph.NodeFactory; +import com.hp.hpl.jena.graph.Triple; +import com.hp.hpl.jena.sparql.core.Quad; + +/** + * Tests for the {@link TriplesToQuadsConstantGraphMapper} + * + * + * + */ +public class TriplesToQuadsConstantGraphMapperTest extends AbstractMapperTests<LongWritable, TripleWritable, LongWritable, QuadWritable> { + + @Override + protected Mapper<LongWritable, TripleWritable, LongWritable, QuadWritable> getInstance() { + return new TriplesToQuadsConstantGraphMapper<LongWritable>(); + } + + protected void generateData(MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver, int num) { + for (int i = 0; i < num; i++) { + Triple t = new Triple(NodeFactory.createURI("http://subjects/" + i), NodeFactory.createURI("http://predicate"), + NodeFactory.createLiteral(Integer.toString(i), XSDDatatype.XSDinteger)); + Quad q = new Quad(Quad.defaultGraphNodeGenerated, t); + driver.addInput(new LongWritable(i), new TripleWritable(t)); + driver.addOutput(new LongWritable(i), new QuadWritable(q)); + } + } + + /** + * Tests triples to quads conversion + * + * @throws IOException + */ + @Test + public void triples_to_quads_mapper_01() throws IOException { + MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver(); + + Triple t = new Triple(NodeFactory.createURI("http://s"), NodeFactory.createURI("http://p"), + NodeFactory.createLiteral("test")); + Quad q = new Quad(Quad.defaultGraphNodeGenerated, t); + driver.withInput(new Pair<LongWritable, TripleWritable>(new LongWritable(1), new TripleWritable(t))).withOutput( + new Pair<LongWritable, QuadWritable>(new LongWritable(1), new QuadWritable(q))); + driver.runTest(); + } + + /** + * Tests triples to quads conversion + * + * @throws IOException + */ + @Test + public void triples_to_quads_mapper_02() throws IOException { + MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver(); + this.generateData(driver, 100); + driver.runTest(); + } + + /** + * Tests triples to quads conversion + * + * @throws IOException + */ + @Test + public void triples_to_quads_mapper_03() throws IOException { + MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver(); + this.generateData(driver, 1000); + driver.runTest(); + } + + /** + * Tests triples to quads conversion + * + * @throws IOException + */ + @Test + public void triples_to_quads_mapper_04() throws IOException { + MapDriver<LongWritable, TripleWritable, LongWritable, QuadWritable> driver = this.getMapDriver(); + this.generateData(driver, 10000); + driver.runTest(); + } +} http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/hadoop-rdf/hadoop-rdf-stats/hadoop-job.xml ---------------------------------------------------------------------- diff --git a/hadoop-rdf/hadoop-rdf-stats/hadoop-job.xml b/hadoop-rdf/hadoop-rdf-stats/hadoop-job.xml new 
file mode 100644 index 0000000..de72645 --- /dev/null +++ b/hadoop-rdf/hadoop-rdf-stats/hadoop-job.xml @@ -0,0 +1,46 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<assembly> + <id>hadoop-job</id> + <formats> + <format>jar</format> + </formats> + <includeBaseDirectory>false</includeBaseDirectory> + <dependencySets> + <dependencySet> + <unpack>false</unpack> + <scope>runtime</scope> + <outputDirectory>lib</outputDirectory> + <excludes> + <exclude>${groupId}:${artifactId}</exclude> + </excludes> + </dependencySet> + <dependencySet> + <unpack>true</unpack> + <includes> + <include>${groupId}:${artifactId}</include> + </includes> + </dependencySet> + </dependencySets> + <fileSets> + <fileSet> + <directory>${basedir}/target/test-classes</directory> + <outputDirectory>/</outputDirectory> + </fileSet> + </fileSets> +</assembly> http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/hadoop-rdf/hadoop-rdf-stats/pom.xml ---------------------------------------------------------------------- diff --git a/hadoop-rdf/hadoop-rdf-stats/pom.xml b/hadoop-rdf/hadoop-rdf-stats/pom.xml new file mode 100644 index 0000000..bf69fa6 --- /dev/null +++ b/hadoop-rdf/hadoop-rdf-stats/pom.xml @@ -0,0 +1,103 @@ +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
+--> + +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <parent> + <groupId>org.apache.jena</groupId> + <artifactId>jena-hadoop-rdf</artifactId> + <version>0.9.0-SNAPSHOT</version> + </parent> + <artifactId>jena-hadoop-rdf-stats</artifactId> + <name>Apache Jena - RDF Tools for Hadoop - Statistics Demo App</name> + <description>A demo application that can be run on Hadoop to produce a statistical analysis on arbitrary RDF inputs</description> + + <dependencies> + <!-- Internal Project Dependencies --> + <dependency> + <groupId>org.apache.jena</groupId> + <artifactId>jena-hadoop-rdf-io</artifactId> + <version>${project.version}</version> + </dependency> + <dependency> + <groupId>org.apache.jena</groupId> + <artifactId>jena-hadoop-rdf-mapreduce</artifactId> + <version>${project.version}</version> + </dependency> + + <!-- CLI related Dependencies --> + <dependency> + <groupId>io.airlift</groupId> + <artifactId>airline</artifactId> + <version>0.6</version> + </dependency> + + <!-- Hadoop Dependencies --> + <!-- Note these will be provided on the Hadoop cluster hence the provided + scope --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <scope>provided</scope> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-common</artifactId> + <scope>provided</scope> + </dependency> + + <!-- Test Dependencies --> + <dependency> + <groupId>org.apache.jena</groupId> + <artifactId>jena-hadoop-rdf-mapreduce</artifactId> + <version>${project.version}</version> + <classifier>tests</classifier> + <scope>test</scope> + </dependency> + <dependency> + <groupId>org.apache.mrunit</groupId> + <artifactId>mrunit</artifactId> + <scope>test</scope> + <classifier>hadoop2</classifier> + </dependency> + </dependencies> + + <build> + <plugins> + <!-- Assembly plugin is used to produce the runnable Hadoop JAR with all + dependencies contained therein --> + <plugin> + <artifactId>maven-assembly-plugin</artifactId> + <configuration> + <descriptors> + <descriptor>hadoop-job.xml</descriptor> + </descriptors> + </configuration> + <executions> + <execution> + <id>make-assembly</id> + <phase>package</phase> + <goals> + <goal>single</goal> + </goals> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java ---------------------------------------------------------------------- diff --git a/hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java b/hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java new file mode 100644 index 0000000..917176e --- /dev/null +++ b/hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java @@ -0,0 +1,408 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.hadoop.rdf.stats; + +import io.airlift.command.Arguments; +import io.airlift.command.Command; +import io.airlift.command.Help; +import io.airlift.command.HelpOption; +import io.airlift.command.Option; +import io.airlift.command.OptionType; +import io.airlift.command.ParseArgumentsMissingException; +import io.airlift.command.ParseArgumentsUnexpectedException; +import io.airlift.command.ParseException; +import io.airlift.command.ParseOptionMissingException; +import io.airlift.command.ParseOptionMissingValueException; +import io.airlift.command.SingleCommand; +import io.airlift.command.model.CommandMetadata; + +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; +import java.util.concurrent.TimeUnit; + +import javax.inject.Inject; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.util.Tool; +import org.apache.hadoop.util.ToolRunner; +import org.apache.jena.hadoop.rdf.stats.jobs.JobFactory; + + +/** + * Entry point for the Hadoop job, handles launching all the relevant Hadoop + * jobs + * + * + * + */ +@Command(name = "bin/hadoop jar PATH_TO_JAR org.apache.jena.hadoop.rdf.stats.RdfStats", description = "A command which computes statistics on RDF data using Hadoop") +public class RdfStats implements Tool { + + static final String ANSI_RED = "\u001B[31m"; + static final String ANSI_RESET = "\u001B[0m"; + + private static final String DATA_TYPE_TRIPLES = "triples", DATA_TYPE_QUADS = "quads", DATA_TYPE_MIXED = "mixed"; + + /** + * Help option + */ + @Inject + public HelpOption helpOption; + + /** + * Gets/Sets whether all available statistics will be calculated + */ + @Option(name = { "-a", "--all" }, description = "Requests that all available statistics be calculated", type = OptionType.COMMAND) + public boolean all = false; + + /** + * Gets/Sets whether node usage counts will be calculated + */ + @Option(name = { "-n", "--node-count" }, description = "Requests that node usage counts be calculated", type = OptionType.COMMAND) + public boolean nodeCount = false; + + /** + * Gets/Sets whether characteristic sets will be calculated + */ + @Option(name = { "-c", "--characteristic-sets" }, description = "Requests that characteristic sets be calculated", type = OptionType.COMMAND) + public boolean characteristicSets = false; + + /** + * Gets/Sets whether type counts will be calculated + */ + @Option(name = { "-t", "--type-counts" }, description = "Requests that rdf:type usage counts be calculated", type = OptionType.COMMAND) + public boolean typeCount = false; + + /** + * Gets/Sets whether data type counts will be calculated + */ + @Option(name = { "-d", "--data-types" }, description = "Requests that literal data type usage counts be calculated", type = OptionType.COMMAND) + public boolean dataTypeCount = false; + + /** + * Gets/Sets whether namespace counts will be calculated + */ + @Option(name = { "--namespaces" }, description = "Requests that namespace usage counts be calculated", type = OptionType.COMMAND) + public boolean namespaceCount = false; + + /** + * 
Gets/Sets the input data type used + */ + @Option(name = { "--input-type" }, allowedValues = { DATA_TYPE_MIXED, DATA_TYPE_QUADS, DATA_TYPE_TRIPLES }, description = "Specifies whether the input data is a mixture of quads and triples, just quads or just triples. Using the most specific data type will yield the most accurate statistics") + public String inputType = DATA_TYPE_MIXED; + + /** + * Gets/Sets the output path + */ + @Option(name = { "-o", "--output" }, title = "OutputPath", description = "Sets the output path", arity = 1, required = true) + public String outputPath = null; + + /** + * Gets/Sets the input path(s) + */ + @Arguments(description = "Sets the input path(s)", title = "InputPath", required = true) + public List<String> inputPaths = new ArrayList<String>(); + + private Configuration config; + + /** + * Entry point method + * + * @param args + * Arguments + * @throws Exception + */ + public static void main(String[] args) throws Exception { + try { + // Run and exit with result code if no errors bubble up + // Note that the exit code may still be an error code + int res = ToolRunner.run(new Configuration(true), new RdfStats(), args); + System.exit(res); + } catch (Exception e) { + System.err.println(ANSI_RED + e.getMessage()); + e.printStackTrace(System.err); + } finally { + System.err.print(ANSI_RESET); + } + // If any errors bubble up exit with non-zero code + System.exit(1); + } + + private static void showUsage() { + CommandMetadata metadata = SingleCommand.singleCommand(RdfStats.class).getCommandMetadata(); + StringBuilder builder = new StringBuilder(); + Help.help(metadata, builder); + System.err.print(ANSI_RESET); + System.err.println(builder.toString()); + System.exit(1); + } + + @Override + public void setConf(Configuration conf) { + this.config = conf; + } + + @Override + public Configuration getConf() { + return this.config; + } + + @Override + public int run(String[] args) throws Exception { + try { + // Parse custom arguments + RdfStats cmd = SingleCommand.singleCommand(RdfStats.class).parse(args); + + // Copy Hadoop configuration across + cmd.setConf(this.getConf()); + + // Show help if requested and exit with success + if (cmd.helpOption.showHelpIfRequested()) { + return 0; + } + + // Run the command and exit with success + cmd.run(); + return 0; + + } catch (ParseOptionMissingException e) { + System.err.println(ANSI_RED + e.getMessage()); + System.err.println(); + showUsage(); + } catch (ParseOptionMissingValueException e) { + System.err.println(ANSI_RED + e.getMessage()); + System.err.println(); + showUsage(); + } catch (ParseArgumentsMissingException e) { + System.err.println(ANSI_RED + e.getMessage()); + System.err.println(); + showUsage(); + } catch (ParseArgumentsUnexpectedException e) { + System.err.println(ANSI_RED + e.getMessage()); + System.err.println(); + showUsage(); + // TODO Re-enable as and when we upgrade Airline + // } catch (ParseOptionIllegalValueException e) { + // System.err.println(ANSI_RED + e.getMessage()); + // System.err.println(); + // showUsage(); + } catch (ParseException e) { + System.err.println(ANSI_RED + e.getMessage()); + System.err.println(); + showUsage(); + } catch (UnsupportedOperationException e) { + System.err.println(ANSI_RED + e.getMessage()); + } catch (Throwable e) { + System.err.println(ANSI_RED + e.getMessage()); + e.printStackTrace(System.err); + } finally { + System.err.print(ANSI_RESET); + } + return 1; + } + + private void run() throws Throwable { + if (!this.outputPath.endsWith("/")) { + this.outputPath += 
"/"; + } + + // If all statistics requested turn on all statistics + if (this.all) { + this.nodeCount = true; + this.characteristicSets = true; + this.typeCount = true; + this.dataTypeCount = true; + this.namespaceCount = true; + } + + // How many statistics were requested? + int statsRequested = 0; + if (this.nodeCount) + statsRequested++; + if (this.characteristicSets) + statsRequested++; + if (this.typeCount) + statsRequested++; + if (this.dataTypeCount) + statsRequested++; + if (this.namespaceCount) + statsRequested++; + + // Error if no statistics requested + if (statsRequested == 0) { + System.err + .println("You did not request any statistics to be calculated, please use one/more of the relevant options to select the statistics to be computed"); + return; + } + int statsComputed = 1; + + // Compute statistics + if (this.nodeCount) { + Job job = this.selectNodeCountJob(); + statsComputed = this.computeStatistic(job, statsComputed, statsRequested); + } + if (this.typeCount) { + Job[] jobs = this.selectTypeCountJobs(); + statsComputed = this.computeStatistic(jobs, false, false, statsComputed, statsRequested); + } + if (this.dataTypeCount) { + Job job = this.selectDataTypeCountJob(); + statsComputed = this.computeStatistic(job, statsComputed, statsRequested); + } + if (this.namespaceCount) { + Job job = this.selectNamespaceCountJob(); + statsComputed = this.computeStatistic(job, statsComputed, statsRequested); + } + if (this.characteristicSets) { + Job[] jobs = this.selectCharacteristicSetJobs(); + statsComputed = this.computeStatistic(jobs, false, false, statsComputed, statsRequested); + } + } + + private int computeStatistic(Job job, int statsComputed, int statsRequested) throws Throwable { + System.out.println(String.format("Computing Statistic %d of %d requested", statsComputed, statsRequested)); + this.runJob(job); + System.out.println(String.format("Computed Statistic %d of %d requested", statsComputed, statsRequested)); + System.out.println(); + return ++statsComputed; + } + + private int computeStatistic(Job[] jobs, boolean continueOnFailure, boolean continueOnError, int statsComputed, + int statsRequested) { + System.out.println(String.format("Computing Statistic %d of %d requested", statsComputed, statsRequested)); + this.runJobSequence(jobs, continueOnFailure, continueOnError); + System.out.println(String.format("Computed Statistic %d of %d requested", statsComputed, statsRequested)); + System.out.println(); + return ++statsComputed; + } + + private boolean runJob(Job job) throws Throwable { + System.out.println("Submitting Job " + job.getJobName()); + long start = System.nanoTime(); + try { + job.submit(); + if (job.monitorAndPrintJob()) { + System.out.println("Job " + job.getJobName() + " succeeded"); + return true; + } else { + System.out.println("Job " + job.getJobName() + " failed"); + return false; + } + } catch (Throwable e) { + System.out.println("Unexpected failure in Job " + job.getJobName()); + throw e; + } finally { + long end = System.nanoTime(); + System.out.println("Job " + job.getJobName() + " finished after " + + String.format("%,d milliseconds", TimeUnit.NANOSECONDS.toMillis(end - start))); + System.out.println(); + } + } + + private void runJobSequence(Job[] jobs, boolean continueOnFailure, boolean continueOnError) { + for (int i = 0; i < jobs.length; i++) { + Job job = jobs[i]; + try { + boolean success = this.runJob(job); + if (!success && !continueOnFailure) + throw new IllegalStateException("Unable to complete job sequence because Job " + 
job.getJobName() + " failed"); + } catch (IllegalStateException e) { + throw e; + } catch (Throwable e) { + if (!continueOnError) + throw new IllegalStateException("Unable to complete job sequence because job " + job.getJobName() + + " errored", e); + } + } + } + + private Job selectNodeCountJob() throws IOException { + String realOutputPath = outputPath + "node-counts/"; + String[] inputs = new String[this.inputPaths.size()]; + this.inputPaths.toArray(inputs); + + if (DATA_TYPE_QUADS.equals(this.inputType)) { + return JobFactory.getQuadNodeCountJob(this.config, inputs, realOutputPath); + } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { + return JobFactory.getTripleNodeCountJob(this.config, inputs, realOutputPath); + } else { + return JobFactory.getNodeCountJob(this.config, inputs, realOutputPath); + } + } + + private Job selectDataTypeCountJob() throws IOException { + String realOutputPath = outputPath + "data-type-counts/"; + String[] inputs = new String[this.inputPaths.size()]; + this.inputPaths.toArray(inputs); + + if (DATA_TYPE_QUADS.equals(this.inputType)) { + return JobFactory.getQuadDataTypeCountJob(this.config, inputs, realOutputPath); + } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { + return JobFactory.getTripleDataTypeCountJob(this.config, inputs, realOutputPath); + } else { + return JobFactory.getDataTypeCountJob(this.config, inputs, realOutputPath); + } + } + + private Job selectNamespaceCountJob() throws IOException { + String realOutputPath = outputPath + "namespace-counts/"; + String[] inputs = new String[this.inputPaths.size()]; + this.inputPaths.toArray(inputs); + + if (DATA_TYPE_QUADS.equals(this.inputType)) { + return JobFactory.getQuadNamespaceCountJob(this.config, inputs, realOutputPath); + } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { + return JobFactory.getTripleNamespaceCountJob(this.config, inputs, realOutputPath); + } else { + return JobFactory.getNamespaceCountJob(this.config, inputs, realOutputPath); + } + } + + private Job[] selectCharacteristicSetJobs() throws IOException { + String intermediateOutputPath = outputPath + "characteristics/intermediate/"; + String finalOutputPath = outputPath + "characteristics/final/"; + String[] inputs = new String[this.inputPaths.size()]; + this.inputPaths.toArray(inputs); + + if (DATA_TYPE_QUADS.equals(this.inputType)) { + return JobFactory.getQuadCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); + } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { + return JobFactory.getTripleCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); + } else { + return JobFactory.getCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); + } + } + + private Job[] selectTypeCountJobs() throws IOException { + String intermediateOutputPath = outputPath + "type-declarations/"; + String finalOutputPath = outputPath + "type-counts/"; + String[] inputs = new String[this.inputPaths.size()]; + this.inputPaths.toArray(inputs); + + if (DATA_TYPE_QUADS.equals(this.inputType)) { + return JobFactory.getQuadTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); + } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { + return JobFactory.getTripleTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); + } else { + return JobFactory.getTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); + } + } +} 
http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java ---------------------------------------------------------------------- diff --git a/hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java b/hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java new file mode 100644 index 0000000..10ba65d --- /dev/null +++ b/hadoop-rdf/hadoop-rdf-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java @@ -0,0 +1,757 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.jena.hadoop.rdf.stats.jobs; + +import java.io.IOException; + +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.io.LongWritable; +import org.apache.hadoop.io.NullWritable; +import org.apache.hadoop.io.SequenceFile.CompressionType; +import org.apache.hadoop.io.Text; +import org.apache.hadoop.io.compress.BZip2Codec; +import org.apache.hadoop.mapreduce.Job; +import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; +import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; +import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; +import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; +import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; +import org.apache.hadoop.util.StringUtils; +import org.apache.jena.hadoop.rdf.io.input.NQuadsInputFormat; +import org.apache.jena.hadoop.rdf.io.input.NTriplesInputFormat; +import org.apache.jena.hadoop.rdf.io.input.QuadsInputFormat; +import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat; +import org.apache.jena.hadoop.rdf.io.input.TriplesOrQuadsInputFormat; +import org.apache.jena.hadoop.rdf.io.output.NQuadsOutputFormat; +import org.apache.jena.hadoop.rdf.io.output.NTriplesNodeOutputFormat; +import org.apache.jena.hadoop.rdf.io.output.NTriplesOutputFormat; +import org.apache.jena.hadoop.rdf.mapreduce.KeyMapper; +import org.apache.jena.hadoop.rdf.mapreduce.RdfMapReduceConstants; +import org.apache.jena.hadoop.rdf.mapreduce.TextCountReducer; +import org.apache.jena.hadoop.rdf.mapreduce.characteristics.CharacteristicSetReducer; +import org.apache.jena.hadoop.rdf.mapreduce.characteristics.QuadCharacteristicSetGeneratingReducer; +import org.apache.jena.hadoop.rdf.mapreduce.characteristics.TripleCharacteristicSetGeneratingReducer; +import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer; +import org.apache.jena.hadoop.rdf.mapreduce.count.QuadNodeCountMapper; +import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper; +import 
org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.QuadDataTypeCountMapper; +import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.TripleDataTypeCountMapper; +import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.QuadNamespaceCountMapper; +import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.TripleNamespaceCountMapper; +import org.apache.jena.hadoop.rdf.mapreduce.count.positional.QuadObjectCountMapper; +import org.apache.jena.hadoop.rdf.mapreduce.count.positional.TripleObjectCountMapper; +import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.QuadFilterByPredicateMapper; +import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.TripleFilterByPredicateUriMapper; +import org.apache.jena.hadoop.rdf.mapreduce.group.QuadGroupBySubjectMapper; +import org.apache.jena.hadoop.rdf.mapreduce.group.TripleGroupBySubjectMapper; +import org.apache.jena.hadoop.rdf.types.CharacteristicSetWritable; +import org.apache.jena.hadoop.rdf.types.NodeWritable; +import org.apache.jena.hadoop.rdf.types.QuadWritable; +import org.apache.jena.hadoop.rdf.types.TripleWritable; + +import com.hp.hpl.jena.vocabulary.RDF; + +/** + * Factory that can produce {@link Job} instances for computing various RDF + * statistics + * + * + * + */ +public class JobFactory { + + /** + * Private constructor prevents instantiation + */ + private JobFactory() { + } + + /** + * Gets a job for computing node counts on RDF triple inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job getTripleNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Triples Node Usage Count"); + + // Map/Reduce classes + job.setMapperClass(TripleNodeCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(TriplesInputFormat.class); + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } + + /** + * Gets a job for computing node counts on RDF quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job getQuadNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Quads Node Usage Count"); + + // Map/Reduce classes + job.setMapperClass(QuadNodeCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(QuadsInputFormat.class); + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } + + /** + * Gets a job for computing node counts on RDF triple and/or quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output 
path + * @return Job + * @throws IOException + */ + public static Job getNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Node Usage Count"); + + // Map/Reduce classes + job.setMapperClass(QuadNodeCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(TriplesOrQuadsInputFormat.class); + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } + + /** + * Gets a sequence of jobs that can be used to compute characteristic sets + * for RDF triples + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param intermediateOutputPath + * Intermediate output path + * @param outputPath + * Final output path + * @return Sequence of jobs + * @throws IOException + */ + public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, + String outputPath) throws IOException { + Job[] jobs = new Job[2]; + + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Triples Characteristic Set (Generation)"); + + // Map/Reduce classes + job.setMapperClass(TripleGroupBySubjectMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(TripleWritable.class); + job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class); + job.setOutputKeyClass(CharacteristicSetWritable.class); + job.setOutputValueClass(NullWritable.class); + + // Input and Output + job.setInputFormatClass(TriplesInputFormat.class); + job.setOutputFormatClass(SequenceFileOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); + SequenceFileOutputFormat.setCompressOutput(job, true); + FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class); + SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); + + jobs[0] = job; + + job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Triples Characteristic Set (Reduction)"); + + // Map/Reduce classes + job.setMapperClass(KeyMapper.class); + job.setMapOutputKeyClass(CharacteristicSetWritable.class); + job.setMapOutputValueClass(CharacteristicSetWritable.class); + job.setReducerClass(CharacteristicSetReducer.class); + job.setOutputKeyClass(CharacteristicSetWritable.class); + job.setOutputValueClass(CharacteristicSetWritable.class); + + // Input and Output + job.setInputFormatClass(SequenceFileInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + FileInputFormat.setInputPaths(job, intermediateOutputPath); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + jobs[1] = job; + return jobs; + } + + /** + * Gets a sequence of jobs that can be used to compute characteristic sets + * for RDF quads + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param intermediateOutputPath + * Intermediate output path + * @param outputPath + * Final output path + * @return Sequence of jobs + * @throws IOException + */ + public static Job[] 
getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, + String outputPath) throws IOException { + Job[] jobs = new Job[2]; + + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Quads Characteristic Set (Generation)"); + + // Map/Reduce classes + job.setMapperClass(QuadGroupBySubjectMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(QuadWritable.class); + job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class); + job.setOutputKeyClass(CharacteristicSetWritable.class); + job.setOutputValueClass(NullWritable.class); + + // Input and Output + job.setInputFormatClass(QuadsInputFormat.class); + job.setOutputFormatClass(SequenceFileOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); + SequenceFileOutputFormat.setCompressOutput(job, true); + FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class); + SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); + + jobs[0] = job; + + job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Quads Characteristic Set (Reduction)"); + + // Map/Reduce classes + job.setMapperClass(KeyMapper.class); + job.setMapOutputKeyClass(CharacteristicSetWritable.class); + job.setMapOutputValueClass(CharacteristicSetWritable.class); + job.setReducerClass(CharacteristicSetReducer.class); + job.setOutputKeyClass(CharacteristicSetWritable.class); + job.setOutputValueClass(CharacteristicSetWritable.class); + + // Input and Output + job.setInputFormatClass(SequenceFileInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + FileInputFormat.setInputPaths(job, intermediateOutputPath); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + jobs[1] = job; + return jobs; + } + + /** + * Gets a sequence of jobs that can be used to compute characteristic sets + * for RDF triple and/or quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param intermediateOutputPath + * Intermediate output path + * @param outputPath + * Final output path + * @return Sequence of jobs + * @throws IOException + */ + public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, + String outputPath) throws IOException { + Job[] jobs = new Job[2]; + + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Characteristic Set (Generation)"); + + // Map/Reduce classes + job.setMapperClass(QuadGroupBySubjectMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(QuadWritable.class); + job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class); + job.setOutputKeyClass(CharacteristicSetWritable.class); + job.setOutputValueClass(NullWritable.class); + + // Input and Output + job.setInputFormatClass(TriplesOrQuadsInputFormat.class); + job.setOutputFormatClass(SequenceFileOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); + SequenceFileOutputFormat.setCompressOutput(job, true); + FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class); + SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); + + jobs[0] = job; + + job = 
Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Characteristic Set (Reduction)"); + + // Map/Reduce classes + job.setMapperClass(KeyMapper.class); + job.setMapOutputKeyClass(CharacteristicSetWritable.class); + job.setMapOutputValueClass(CharacteristicSetWritable.class); + job.setReducerClass(CharacteristicSetReducer.class); + job.setOutputKeyClass(CharacteristicSetWritable.class); + job.setOutputValueClass(CharacteristicSetWritable.class); + + // Input and Output + job.setInputFormatClass(SequenceFileInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + FileInputFormat.setInputPaths(job, intermediateOutputPath); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + jobs[1] = job; + return jobs; + } + + /** + * Gets a job for computing type counts on RDF triple inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param intermediateOutputPath + * Path for intermediate output which will be all the type + * declaration triples present in the inputs + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job[] getTripleTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, + String outputPath) throws IOException { + Job[] jobs = new Job[2]; + + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Type Triples Extraction"); + + // Map/Reduce classes + job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI()); + job.setMapperClass(TripleFilterByPredicateUriMapper.class); + job.setMapOutputKeyClass(LongWritable.class); + job.setMapOutputValueClass(TripleWritable.class); + + // Input and Output Format + job.setInputFormatClass(TriplesInputFormat.class); + job.setOutputFormatClass(NTriplesOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); + + jobs[0] = job; + + // Object Node Usage count job + job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Triples Type Usage Count"); + + // Map/Reduce classes + job.setMapperClass(TripleObjectCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(NTriplesInputFormat.class); + NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be + // better if this was + // intelligently + // configured + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, intermediateOutputPath); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + jobs[1] = job; + + return jobs; + } + + /** + * Gets a job for computing type counts on RDF quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param intermediateOutputPath + * Path for intermediate output which will be all the type + * declaration quads present in the inputs + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job[] getQuadTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, + String outputPath) throws IOException { + Job[] jobs = new Job[2]; + + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Type Quads Extraction"); + + 
// Map/Reduce classes + job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI()); + job.setMapperClass(QuadFilterByPredicateMapper.class); + job.setMapOutputKeyClass(LongWritable.class); + job.setMapOutputValueClass(QuadWritable.class); + + // Input and Output Format + job.setInputFormatClass(QuadsInputFormat.class); + job.setOutputFormatClass(NQuadsOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); + + jobs[0] = job; + + // Object Node Usage count job + job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Quads Type Usage Count"); + + // Map/Reduce classes + job.setMapperClass(QuadObjectCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(NQuadsInputFormat.class); + NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be + // better if this was + // intelligently + // configured + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, intermediateOutputPath); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + jobs[1] = job; + + return jobs; + } + + /** + * Gets a job for computing type counts on RDF triple and/or quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param intermediateOutputPath + * Path for intermediate output which will be all the type + * declaration quads present in the inputs + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job[] getTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, + String outputPath) throws IOException { + Job[] jobs = new Job[2]; + + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Type Extraction"); + + // Map/Reduce classes + job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI()); + job.setMapperClass(QuadFilterByPredicateMapper.class); + job.setMapOutputKeyClass(LongWritable.class); + job.setMapOutputValueClass(QuadWritable.class); + + // Input and Output Format + job.setInputFormatClass(TriplesOrQuadsInputFormat.class); + job.setOutputFormatClass(NQuadsOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); + + jobs[0] = job; + + // Object Node Usage count job + job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Type Usage Count"); + + // Map/Reduce classes + job.setMapperClass(QuadObjectCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(NQuadsInputFormat.class); + NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be + // better if this was + // intelligently + // configured + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, intermediateOutputPath); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + jobs[1] = job; + + return jobs; + } + + /** + * Gets a job for computing literal data type counts on RDF triple inputs + * + * 
@param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job getTripleDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Triples Literal Data Type Usage Count"); + + // Map/Reduce classes + job.setMapperClass(TripleDataTypeCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(TriplesInputFormat.class); + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } + + /** + * Gets a job for computing literal data type counts on RDF quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job getQuadDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Quads Literal Data Type Usage Count"); + + // Map/Reduce classes + job.setMapperClass(QuadDataTypeCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(QuadsInputFormat.class); + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } + + /** + * Gets a job for computing literal data type counts on RDF triple and/or + * quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job getDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Literal Data Type Usage Count"); + + // Map/Reduce classes + job.setMapperClass(QuadDataTypeCountMapper.class); + job.setMapOutputKeyClass(NodeWritable.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(NodeCountReducer.class); + + // Input and Output + job.setInputFormatClass(TriplesOrQuadsInputFormat.class); + job.setOutputFormatClass(NTriplesNodeOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } + + /** + * Gets a job for computing literal data type counts on RDF triple inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job getTripleNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Triples Namespace Usage Count"); + + // Map/Reduce classes + 
job.setMapperClass(TripleNamespaceCountMapper.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(TextCountReducer.class); + + // Input and Output + job.setInputFormatClass(TriplesInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } + + /** + * Gets a job for computing namespace usage counts on RDF quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job getQuadNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Quads Namespace Usage Count"); + + // Map/Reduce classes + job.setMapperClass(QuadNamespaceCountMapper.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(TextCountReducer.class); + + // Input and Output + job.setInputFormatClass(QuadsInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } + + /** + * Gets a job for computing namespace usage counts on RDF triple and/or + * quad inputs + * + * @param config + * Configuration + * @param inputPaths + * Input paths + * @param outputPath + * Output path + * @return Job + * @throws IOException + */ + public static Job getNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { + Job job = Job.getInstance(config); + job.setJarByClass(JobFactory.class); + job.setJobName("RDF Namespace Usage Count"); + + // Map/Reduce classes + job.setMapperClass(QuadNamespaceCountMapper.class); + job.setMapOutputKeyClass(Text.class); + job.setMapOutputValueClass(LongWritable.class); + job.setReducerClass(TextCountReducer.class); + + // Input and Output + job.setInputFormatClass(TriplesOrQuadsInputFormat.class); + job.setOutputFormatClass(TextOutputFormat.class); + FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); + FileOutputFormat.setOutputPath(job, new Path(outputPath)); + + return job; + } +} http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/hadoop-rdf/pom.xml ---------------------------------------------------------------------- diff --git a/hadoop-rdf/pom.xml b/hadoop-rdf/pom.xml new file mode 100644 index 0000000..30a6fbd --- /dev/null +++ b/hadoop-rdf/pom.xml @@ -0,0 +1,142 @@ +<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor + license agreements. See the NOTICE file distributed with this work for additional + information regarding copyright ownership. The ASF licenses this file to + You under the Apache License, Version 2.0 (the "License"); you may not use + this file except in compliance with the License. You may obtain a copy of + the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required + by applicable law or agreed to in writing, software distributed under the + License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS + OF ANY KIND, either express or implied. 
See the License for the specific + language governing permissions and limitations under the License. --> +<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" + xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> + <modelVersion>4.0.0</modelVersion> + <groupId>org.apache.jena</groupId> + <artifactId>jena-hadoop-rdf</artifactId> + <version>0.9.0-SNAPSHOT</version> + <packaging>pom</packaging> + + <!-- <parent> <groupId>org.apache.jena</groupId> <artifactId>jena-parent</artifactId> + <version>10-SNAPSHOT</version> <relativePath /> </parent> --> + + <name>Apache Jena - RDF Tools for Hadoop</name> + <description>A collection of tools for working with RDF on the Hadoop platform</description> + + <modules> + <module>hadoop-rdf-io</module> + <module>hadoop-rdf-common</module> + <module>hadoop-rdf-mapreduce</module> + <module>hadoop-rdf-stats</module> + </modules> + + <!-- Properties common across all profiles --> + <properties> + <plugin.compiler.version>2.5.1</plugin.compiler.version> + <arq.version>2.12.1-SNAPSHOT</arq.version> + <junit.version>4.11</junit.version> + <mrunit.version>1.0.0</mrunit.version> + </properties> + + <!-- Profiles to allow building for different Hadoop versions --> + <!-- Currently there is only a single profile targeting Hadoop 2.x because + we're using the newer MRv2 APIs which aren't backwards compatible with Hadoop + 1.x versions --> + <profiles> + <!-- Hadoop 2.x Stable --> + <profile> + <id>hadoop_2x</id> + <activation> + <activeByDefault>true</activeByDefault> + </activation> + <properties> + <hadoop.version>2.4.1</hadoop.version> + </properties> + </profile> + + <!-- Hadoop 0.23 --> + <profile> + <id>hadoop_023x</id> + <properties> + <hadoop.version>0.23.9</hadoop.version> + </properties> + </profile> + </profiles> + + <dependencyManagement> + <dependencies> + <!-- Hadoop Dependencies --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-common</artifactId> + <version>${hadoop.version}</version> + </dependency> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-mapreduce-client-common</artifactId> + <version>${hadoop.version}</version> + </dependency> + + <!-- Jena Dependencies --> + <dependency> + <groupId>org.apache.jena</groupId> + <artifactId>jena-arq</artifactId> + <version>${arq.version}</version> + </dependency> + + <!-- Test Dependencies --> + <dependency> + <groupId>org.apache.hadoop</groupId> + <artifactId>hadoop-minicluster</artifactId> + <version>${hadoop.version}</version> + </dependency> + <dependency> + <groupId>junit</groupId> + <artifactId>junit</artifactId> + <version>${junit.version}</version> + </dependency> + <dependency> + <groupId>org.apache.mrunit</groupId> + <artifactId>mrunit</artifactId> + <version>${mrunit.version}</version> + <classifier>hadoop2</classifier> + </dependency> + </dependencies> + </dependencyManagement> + + <build> + <plugins> + <plugin> + <artifactId>maven-compiler-plugin</artifactId> + <version>${plugin.compiler.version}</version> + <configuration> + <source>1.7</source> + <target>1.7</target> + </configuration> + </plugin> + <plugin> + <groupId>org.apache.maven.plugins</groupId> + <artifactId>maven-enforcer-plugin</artifactId> + <executions> + <execution> + <id>enforce-java</id> + <goals> + <goal>enforce</goal> + </goals> + <configuration> + <rules> + <requireJavaVersion> + <version>1.7.0</version> + </requireJavaVersion> + </rules> + <!-- Hadoop dependencies introduce a 
huge range of dependency convergence + issues. Therefore we don't fail the builds for these modules which is less + than ideal but far easier than trying to fix the mess that is Hadoops transitive + dependencies --> + <fail>false</fail> + </configuration> + </execution> + </executions> + </plugin> + </plugins> + </build> +</project> http://git-wip-us.apache.org/repos/asf/jena/blob/05c389be/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml deleted file mode 100644 index 30a6fbd..0000000 --- a/pom.xml +++ /dev/null @@ -1,142 +0,0 @@ -<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor - license agreements. See the NOTICE file distributed with this work for additional - information regarding copyright ownership. The ASF licenses this file to - You under the Apache License, Version 2.0 (the "License"); you may not use - this file except in compliance with the License. You may obtain a copy of - the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required - by applicable law or agreed to in writing, software distributed under the - License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS - OF ANY KIND, either express or implied. See the License for the specific - language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <groupId>org.apache.jena</groupId> - <artifactId>jena-hadoop-rdf</artifactId> - <version>0.9.0-SNAPSHOT</version> - <packaging>pom</packaging> - - <!-- <parent> <groupId>org.apache.jena</groupId> <artifactId>jena-parent</artifactId> - <version>10-SNAPSHOT</version> <relativePath /> </parent> --> - - <name>Apache Jena - RDF Tools for Hadoop</name> - <description>A collection of tools for working with RDF on the Hadoop platform</description> - - <modules> - <module>hadoop-rdf-io</module> - <module>hadoop-rdf-common</module> - <module>hadoop-rdf-mapreduce</module> - <module>hadoop-rdf-stats</module> - </modules> - - <!-- Properties common across all profiles --> - <properties> - <plugin.compiler.version>2.5.1</plugin.compiler.version> - <arq.version>2.12.1-SNAPSHOT</arq.version> - <junit.version>4.11</junit.version> - <mrunit.version>1.0.0</mrunit.version> - </properties> - - <!-- Profiles to allow building for different Hadoop versions --> - <!-- Currently there is only a single profile targeting Hadoop 2.x because - we're using the newer MRv2 APIs which aren't backwards compatible with Hadoop - 1.x versions --> - <profiles> - <!-- Hadoop 2.x Stable --> - <profile> - <id>hadoop_2x</id> - <activation> - <activeByDefault>true</activeByDefault> - </activation> - <properties> - <hadoop.version>2.4.1</hadoop.version> - </properties> - </profile> - - <!-- Hadoop 0.23 --> - <profile> - <id>hadoop_023x</id> - <properties> - <hadoop.version>0.23.9</hadoop.version> - </properties> - </profile> - </profiles> - - <dependencyManagement> - <dependencies> - <!-- Hadoop Dependencies --> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-common</artifactId> - <version>${hadoop.version}</version> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-mapreduce-client-common</artifactId> - <version>${hadoop.version}</version> - </dependency> - - <!-- Jena 
Dependencies --> - <dependency> - <groupId>org.apache.jena</groupId> - <artifactId>jena-arq</artifactId> - <version>${arq.version}</version> - </dependency> - - <!-- Test Dependencies --> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-minicluster</artifactId> - <version>${hadoop.version}</version> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>${junit.version}</version> - </dependency> - <dependency> - <groupId>org.apache.mrunit</groupId> - <artifactId>mrunit</artifactId> - <version>${mrunit.version}</version> - <classifier>hadoop2</classifier> - </dependency> - </dependencies> - </dependencyManagement> - - <build> - <plugins> - <plugin> - <artifactId>maven-compiler-plugin</artifactId> - <version>${plugin.compiler.version}</version> - <configuration> - <source>1.7</source> - <target>1.7</target> - </configuration> - </plugin> - <plugin> - <groupId>org.apache.maven.plugins</groupId> - <artifactId>maven-enforcer-plugin</artifactId> - <executions> - <execution> - <id>enforce-java</id> - <goals> - <goal>enforce</goal> - </goals> - <configuration> - <rules> - <requireJavaVersion> - <version>1.7.0</version> - </requireJavaVersion> - </rules> - <!-- Hadoop dependencies introduce a huge range of dependency convergence - issues. Therefore we don't fail the builds for these modules which is less - than ideal but far easier than trying to fix the mess that is Hadoops transitive - dependencies --> - <fail>false</fail> - </configuration> - </execution> - </executions> - </plugin> - </plugins> - </build> -</project>
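
For context, a minimal driver sketch (not part of the commit above) showing how the two chained jobs returned by JobFactory.getTypeCountJobs() might be submitted. The class name TypeCountDriver and the example paths are purely illustrative, and the driver is assumed to live in the same package as JobFactory.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapreduce.Job;

// Hypothetical driver: job 0 filters the inputs down to rdf:type quads
// written to the intermediate path, job 1 counts how often each type
// (the object node of those quads) occurs in that intermediate output.
public class TypeCountDriver {

    public static void main(String[] args) throws Exception {
        Configuration config = new Configuration();

        // Example paths, purely illustrative
        String[] inputs = new String[] { "/data/rdf/input" };
        String intermediate = "/data/rdf/type-quads";
        String output = "/data/rdf/type-counts";

        Job[] jobs = JobFactory.getTypeCountJobs(config, inputs, intermediate, output);

        // The second job consumes the first job's output, so the jobs must run
        // sequentially and the pipeline should abort if any stage fails
        for (Job job : jobs) {
            if (!job.waitForCompletion(true)) {
                System.exit(1);
            }
        }
    }
}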
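
On the build side, the new hadoop-rdf parent POM activates the hadoop_2x profile (Hadoop 2.4.1) by default via activeByDefault, so a plain build targets Hadoop 2.x. Building against Hadoop 0.23 would presumably be done by selecting the other profile explicitly, for example:

mvn clean install -Phadoop_023x

(an illustrative invocation, not taken from the commit).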