http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trig/TriGBlankNodeOutputTests.java ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trig/TriGBlankNodeOutputTests.java b/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trig/TriGBlankNodeOutputTests.java deleted file mode 100644 index c9b3a26..0000000 --- a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trig/TriGBlankNodeOutputTests.java +++ /dev/null @@ -1,120 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.jena.hadoop.rdf.io.output.trig; - -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.jena.hadoop.rdf.io.RdfIOConstants; -import org.apache.jena.hadoop.rdf.types.QuadWritable; -import org.apache.jena.riot.RDFDataMgr; -import org.junit.Assert; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; - -import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; -import com.hp.hpl.jena.graph.Node; -import com.hp.hpl.jena.graph.NodeFactory; -import com.hp.hpl.jena.rdf.model.Model; -import com.hp.hpl.jena.rdf.model.ResIterator; -import com.hp.hpl.jena.rdf.model.Resource; -import com.hp.hpl.jena.sparql.core.Quad; - -/** - * Tests for TriG output with blank nodes - * - * - * - */ -@RunWith(Parameterized.class) -public class TriGBlankNodeOutputTests extends StreamedTriGOutputTest { - - static long $bs1 = RdfIOConstants.DEFAULT_OUTPUT_BATCH_SIZE; - static long $bs2 = 1000; - static long $bs3 = 100; - static long $bs4 = 1; - - /** - * @return Test parameters - */ - @Parameters - public static Collection<Object[]> data() { - return Arrays.asList(new Object[][] { { $bs1 }, { $bs2 }, { $bs3 }, - { $bs4 } }); - } - - /** - * Creates new tests - * - * @param batchSize - * Batch size - */ - public TriGBlankNodeOutputTests(long batchSize) { - super(batchSize); - } - - @Override - protected Iterator<QuadWritable> generateTuples(int num) { - List<QuadWritable> qs = new ArrayList<QuadWritable>(); - Node subject = NodeFactory.createAnon(); - for (int i = 0; i < num; i++) { - Quad t = new Quad( - NodeFactory.createURI("http://example.org/graphs/" + i), - subject, - NodeFactory.createURI("http://example.org/predicate"), - NodeFactory.createLiteral(Integer.toString(i), - XSDDatatype.XSDinteger)); - qs.add(new QuadWritable(t)); - } - return qs.iterator(); - } - - @Override - protected void checkTuples(File f, long expected) { - super.checkTuples(f, expected); - - Model m = RDFDataMgr.loadModel("file://" + f.getAbsolutePath(), - this.getRdfLanguage()); - ResIterator iter = m.listSubjects(); - Set<Node> subjects = new HashSet<Node>(); - while (iter.hasNext()) { - Resource res = iter.next(); - Assert.assertTrue(res.isAnon()); - subjects.add(res.asNode()); - } - // Should only be one subject unless the data was empty in which case - // there will be zero subjects - Assert.assertEquals(expected == 0 ? 0 : 1, subjects.size()); - } - - @Override - protected OutputFormat<NullWritable, QuadWritable> getOutputFormat() { - return new TriGOutputFormat<NullWritable>(); - } - -}
http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trix/TriXOutputTest.java ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trix/TriXOutputTest.java b/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trix/TriXOutputTest.java deleted file mode 100644 index 9b6e307..0000000 --- a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/trix/TriXOutputTest.java +++ /dev/null @@ -1,47 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.jena.hadoop.rdf.io.output.trix; - -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.jena.hadoop.rdf.io.output.AbstractQuadOutputFormatTests; -import org.apache.jena.hadoop.rdf.types.QuadWritable; -import org.apache.jena.riot.Lang; - -/** - * Tests for TriX output format - */ -public class TriXOutputTest extends AbstractQuadOutputFormatTests { - - @Override - protected String getFileExtension() { - return ".trix"; - } - - @Override - protected Lang getRdfLanguage() { - return Lang.TRIX; - } - - @Override - protected OutputFormat<NullWritable, QuadWritable> getOutputFormat() { - return new TriXOutputFormat<NullWritable>(); - } - -} http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/BatchedTurtleOutputTest.java ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/BatchedTurtleOutputTest.java b/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/BatchedTurtleOutputTest.java deleted file mode 100644 index a6c4d70..0000000 --- a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/BatchedTurtleOutputTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.jena.hadoop.rdf.io.output.turtle; - -import java.util.Arrays; -import java.util.Collection; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.jena.hadoop.rdf.io.RdfIOConstants; -import org.apache.jena.hadoop.rdf.io.output.AbstractTripleOutputFormatTests; -import org.apache.jena.hadoop.rdf.types.TripleWritable; -import org.apache.jena.riot.Lang; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; - - -/** - * Tests for Turtle output - * - * - * - */ -@RunWith(Parameterized.class) -public class BatchedTurtleOutputTest extends AbstractTripleOutputFormatTests { - - static long $bs1 = RdfIOConstants.DEFAULT_OUTPUT_BATCH_SIZE; - static long $bs2 = 1000; - static long $bs3 = 100; - static long $bs4 = 1; - - /** - * @return Test parameters - */ - @Parameters - public static Collection<Object[]> data() { - return Arrays.asList(new Object[][] { { $bs1 }, { $bs2 }, { $bs3 }, { $bs4 } }); - } - - private final long batchSize; - - /** - * Creates new tests - * - * @param batchSize - * Batch size - */ - public BatchedTurtleOutputTest(long batchSize) { - this.batchSize = batchSize; - } - - @Override - protected String getFileExtension() { - return ".ttl"; - } - - @Override - protected Lang getRdfLanguage() { - return Lang.TURTLE; - } - - @Override - protected Configuration prepareConfiguration() { - Configuration config = super.prepareConfiguration(); - config.setLong(RdfIOConstants.OUTPUT_BATCH_SIZE, this.batchSize); - return config; - } - - @Override - protected OutputFormat<NullWritable, TripleWritable> getOutputFormat() { - return new BatchedTurtleOutputFormat<NullWritable>(); - } - -} http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/StreamedTurtleOutputTest.java ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/StreamedTurtleOutputTest.java b/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/StreamedTurtleOutputTest.java deleted file mode 100644 index d8843d3..0000000 --- a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/StreamedTurtleOutputTest.java +++ /dev/null @@ -1,92 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.jena.hadoop.rdf.io.output.turtle; - -import java.util.Arrays; -import java.util.Collection; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.jena.hadoop.rdf.io.RdfIOConstants; -import org.apache.jena.hadoop.rdf.io.output.AbstractTripleOutputFormatTests; -import org.apache.jena.hadoop.rdf.types.TripleWritable; -import org.apache.jena.riot.Lang; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; - - -/** - * Tests for Turtle output - * - * - * - */ -@RunWith(Parameterized.class) -public class StreamedTurtleOutputTest extends AbstractTripleOutputFormatTests { - - static long $bs1 = RdfIOConstants.DEFAULT_OUTPUT_BATCH_SIZE; - static long $bs2 = 1000; - static long $bs3 = 100; - static long $bs4 = 1; - - /** - * @return Test parameters - */ - @Parameters - public static Collection<Object[]> data() { - return Arrays.asList(new Object[][] { { $bs1 }, { $bs2 }, { $bs3 }, { $bs4 } }); - } - - private final long batchSize; - - /** - * Creates new tests - * - * @param batchSize - * Batch size - */ - public StreamedTurtleOutputTest(long batchSize) { - this.batchSize = batchSize; - } - - @Override - protected String getFileExtension() { - return ".ttl"; - } - - @Override - protected Lang getRdfLanguage() { - return Lang.TURTLE; - } - - @Override - protected Configuration prepareConfiguration() { - Configuration config = super.prepareConfiguration(); - config.setLong(RdfIOConstants.OUTPUT_BATCH_SIZE, this.batchSize); - return config; - } - - @Override - protected OutputFormat<NullWritable, TripleWritable> getOutputFormat() { - return new TurtleOutputFormat<NullWritable>(); - } - -} http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/TurtleBlankNodeOutputTests.java ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/TurtleBlankNodeOutputTests.java b/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/TurtleBlankNodeOutputTests.java deleted file mode 100644 index 8dcae4e..0000000 --- a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/output/turtle/TurtleBlankNodeOutputTests.java +++ /dev/null @@ -1,118 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.jena.hadoop.rdf.io.output.turtle; - -import java.io.File; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.Collection; -import java.util.HashSet; -import java.util.Iterator; -import java.util.List; -import java.util.Set; - -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.jena.hadoop.rdf.io.RdfIOConstants; -import org.apache.jena.hadoop.rdf.types.TripleWritable; -import org.apache.jena.riot.RDFDataMgr; -import org.junit.Assert; -import org.junit.runner.RunWith; -import org.junit.runners.Parameterized; -import org.junit.runners.Parameterized.Parameters; - -import com.hp.hpl.jena.datatypes.xsd.XSDDatatype; -import com.hp.hpl.jena.graph.Node; -import com.hp.hpl.jena.graph.NodeFactory; -import com.hp.hpl.jena.graph.Triple; -import com.hp.hpl.jena.rdf.model.Model; -import com.hp.hpl.jena.rdf.model.ResIterator; -import com.hp.hpl.jena.rdf.model.Resource; - -/** - * Tests for Turtle output with blank nodes - * - * - * - */ -@RunWith(Parameterized.class) -public class TurtleBlankNodeOutputTests extends StreamedTurtleOutputTest { - - static long $bs1 = RdfIOConstants.DEFAULT_OUTPUT_BATCH_SIZE; - static long $bs2 = 1000; - static long $bs3 = 100; - static long $bs4 = 1; - - /** - * @return Test parameters - */ - @Parameters - public static Collection<Object[]> data() { - return Arrays.asList(new Object[][] { { $bs1 }, { $bs2 }, { $bs3 }, - { $bs4 } }); - } - - /** - * Creates new tests - * - * @param batchSize - * Batch size - */ - public TurtleBlankNodeOutputTests(long batchSize) { - super(batchSize); - } - - @Override - protected Iterator<TripleWritable> generateTuples(int num) { - List<TripleWritable> ts = new ArrayList<TripleWritable>(); - Node subject = NodeFactory.createAnon(); - for (int i = 0; i < num; i++) { - Triple t = new Triple(subject, - NodeFactory.createURI("http://example.org/predicate"), - NodeFactory.createLiteral(Integer.toString(i), - XSDDatatype.XSDinteger)); - ts.add(new TripleWritable(t)); - } - return ts.iterator(); - } - - @Override - protected void checkTuples(File f, long expected) { - super.checkTuples(f, expected); - - Model m = RDFDataMgr.loadModel("file://" + f.getAbsolutePath(), - this.getRdfLanguage()); - ResIterator iter = m.listSubjects(); - Set<Node> subjects = new HashSet<Node>(); - while (iter.hasNext()) { - Resource res = iter.next(); - Assert.assertTrue(res.isAnon()); - subjects.add(res.asNode()); - } - // Should only be one subject unless the data was empty in which case - // there will be zero subjects - Assert.assertEquals(expected == 0 ? 0 : 1, subjects.size()); - } - - @Override - protected OutputFormat<NullWritable, TripleWritable> getOutputFormat() { - return new TurtleOutputFormat<NullWritable>(); - } - -} http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/registry/TestHadoopRdfIORegistry.java ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/registry/TestHadoopRdfIORegistry.java b/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/registry/TestHadoopRdfIORegistry.java deleted file mode 100644 index 2eae232..0000000 --- a/jena-hadoop-rdf/jena-elephas-io/src/test/java/org/apache/jena/hadoop/rdf/io/registry/TestHadoopRdfIORegistry.java +++ /dev/null @@ -1,186 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -package org.apache.jena.hadoop.rdf.io.registry; - -import java.io.IOException; -import java.io.StringWriter; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.mapreduce.RecordReader; -import org.apache.hadoop.mapreduce.RecordWriter; -import org.apache.jena.hadoop.rdf.types.QuadWritable; -import org.apache.jena.hadoop.rdf.types.TripleWritable; -import org.apache.jena.riot.Lang; -import org.apache.jena.riot.RDFLanguages; -import org.junit.Assert; -import org.junit.Test; - -/** - * Tests for the {@link HadoopRdfIORegistry} - */ -public class TestHadoopRdfIORegistry { - - private void testLang(Lang lang, boolean triples, boolean quads, boolean writesSupported) { - Assert.assertEquals(triples, HadoopRdfIORegistry.hasTriplesReader(lang)); - Assert.assertEquals(quads, HadoopRdfIORegistry.hasQuadReader(lang)); - - // Some formats may be asymmetric - if (writesSupported) { - Assert.assertEquals(triples, HadoopRdfIORegistry.hasTriplesWriter(lang)); - Assert.assertEquals(quads, HadoopRdfIORegistry.hasQuadWriter(lang)); - } else { - Assert.assertFalse(HadoopRdfIORegistry.hasTriplesWriter(lang)); - Assert.assertFalse(HadoopRdfIORegistry.hasQuadWriter(lang)); - } - - if (triples) { - // Check that triples are supported - RecordReader<LongWritable, TripleWritable> tripleReader; - try { - tripleReader = HadoopRdfIORegistry.createTripleReader(lang); - Assert.assertNotNull(tripleReader); - } catch (IOException e) { - Assert.fail("Registry indicates that " + lang.getName() - + " can read triples but fails to produce a triple reader when asked: " + e.getMessage()); - } - - if (writesSupported) { - RecordWriter<NullWritable, TripleWritable> tripleWriter; - try { - tripleWriter = HadoopRdfIORegistry.createTripleWriter(lang, new StringWriter(), new Configuration( - false)); - Assert.assertNotNull(tripleWriter); - } catch (IOException e) { - Assert.fail("Registry indicates that " + lang.getName() - + " can write triples but fails to produce a triple writer when asked: " + e.getMessage()); - } - } - } else { - // Check that triples are not supported - try { - HadoopRdfIORegistry.createTripleReader(lang); - Assert.fail("Registry indicates that " + lang.getName() - + " cannot read triples but produced a triple reader when asked (error was expected)"); - } catch (IOException e) { - // This is expected - } - try { - HadoopRdfIORegistry.createTripleWriter(lang, new StringWriter(), new Configuration(false)); - Assert.fail("Registry indicates that " + lang.getName() - + " cannot write triples but produced a triple write when asked (error was expected)"); - } catch (IOException e) { - // This is expected - } - } - - if (quads) { - // Check that quads are supported - RecordReader<LongWritable, QuadWritable> quadReader; - try { - quadReader = HadoopRdfIORegistry.createQuadReader(lang); - Assert.assertNotNull(quadReader); - } catch (IOException e) { - Assert.fail("Registry indicates that " + lang.getName() - + " can read quads but fails to produce a quad reader when asked: " + e.getMessage()); - } - - if (writesSupported) { - RecordWriter<NullWritable, QuadWritable> quadWriter; - try { - quadWriter = HadoopRdfIORegistry.createQuadWriter(lang, new StringWriter(), - new Configuration(false)); - Assert.assertNotNull(quadWriter); - } catch (IOException e) { - Assert.fail("Registry indicates that " + lang.getName() - + " can write quads but fails to produce a triple writer when asked: " + e.getMessage()); - } - } - } else { - try { - HadoopRdfIORegistry.createQuadReader(lang); - Assert.fail("Registry indicates that " + lang.getName() - + " cannot read quads but produced a quad reader when asked (error was expected)"); - } catch (IOException e) { - // This is expected - } - try { - HadoopRdfIORegistry.createQuadWriter(lang, new StringWriter(), new Configuration(false)); - Assert.fail("Registry indicates that " + lang.getName() - + " cannot write quads but produced a quad writer when asked (error was expected)"); - } catch (IOException e) { - // This is expected - } - } - } - - @Test - public void json_ld_registered() { - testLang(Lang.JSONLD, true, true, true); - } - - @Test - public void nquads_registered() { - testLang(Lang.NQUADS, false, true, true); - testLang(Lang.NQ, false, true, true); - } - - @Test - public void ntriples_registered() { - testLang(Lang.NTRIPLES, true, false, true); - testLang(Lang.NT, true, false, true); - } - - @Test - public void rdf_json_registered() { - testLang(Lang.RDFJSON, true, false, true); - } - - @Test - public void rdf_xml_registered() { - testLang(Lang.RDFXML, true, false, true); - } - - @Test - public void rdf_thrift_registered() { - testLang(RDFLanguages.THRIFT, true, true, true); - } - - @Test - public void trig_registered() { - testLang(Lang.TRIG, false, true, true); - } - - @Test - public void trix_registered() { - testLang(Lang.TRIX, false, true, true); - } - - @Test - public void turtle_registered() { - testLang(Lang.TURTLE, true, false, true); - testLang(Lang.TTL, true, false, true); - testLang(Lang.N3, true, false, true); - } - - @Test - public void unregistered() { - testLang(Lang.RDFNULL, false, false, true); - } -} http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-stats/hadoop-job.xml ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-stats/hadoop-job.xml b/jena-hadoop-rdf/jena-elephas-stats/hadoop-job.xml deleted file mode 100644 index de72645..0000000 --- a/jena-hadoop-rdf/jena-elephas-stats/hadoop-job.xml +++ /dev/null @@ -1,46 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<assembly> - <id>hadoop-job</id> - <formats> - <format>jar</format> - </formats> - <includeBaseDirectory>false</includeBaseDirectory> - <dependencySets> - <dependencySet> - <unpack>false</unpack> - <scope>runtime</scope> - <outputDirectory>lib</outputDirectory> - <excludes> - <exclude>${groupId}:${artifactId}</exclude> - </excludes> - </dependencySet> - <dependencySet> - <unpack>true</unpack> - <includes> - <include>${groupId}:${artifactId}</include> - </includes> - </dependencySet> - </dependencySets> - <fileSets> - <fileSet> - <directory>${basedir}/target/test-classes</directory> - <outputDirectory>/</outputDirectory> - </fileSet> - </fileSets> -</assembly> http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-stats/pom.xml ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-stats/pom.xml b/jena-hadoop-rdf/jena-elephas-stats/pom.xml deleted file mode 100644 index 899d612..0000000 --- a/jena-hadoop-rdf/jena-elephas-stats/pom.xml +++ /dev/null @@ -1,103 +0,0 @@ -<!-- - Licensed to the Apache Software Foundation (ASF) under one or more - contributor license agreements. See the NOTICE file distributed with - this work for additional information regarding copyright ownership. - The ASF licenses this file to You under the Apache License, Version 2.0 - (the "License"); you may not use this file except in compliance with - the License. You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. ---> - -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <parent> - <groupId>org.apache.jena</groupId> - <artifactId>jena-elephas</artifactId> - <version>0.9.0-SNAPSHOT</version> - </parent> - <artifactId>jena-elephas-stats</artifactId> - <name>Apache Jena - RDF Tools for Hadoop - Statistics Demo App</name> - <description>A demo application that can be run on Hadoop to produce a statistical analysis on arbitrary RDF inputs</description> - - <dependencies> - <!-- Internal Project Dependencies --> - <dependency> - <groupId>org.apache.jena</groupId> - <artifactId>jena-hadoop-rdf-io</artifactId> - <version>${project.version}</version> - </dependency> - <dependency> - <groupId>org.apache.jena</groupId> - <artifactId>jena-hadoop-rdf-mapreduce</artifactId> - <version>${project.version}</version> - </dependency> - - <!-- CLI related Dependencies --> - <dependency> - <groupId>io.airlift</groupId> - <artifactId>airline</artifactId> - <version>0.6</version> - </dependency> - - <!-- Hadoop Dependencies --> - <!-- Note these will be provided on the Hadoop cluster hence the provided - scope --> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-common</artifactId> - <scope>provided</scope> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-mapreduce-client-common</artifactId> - <scope>provided</scope> - </dependency> - - <!-- Test Dependencies --> - <dependency> - <groupId>org.apache.jena</groupId> - <artifactId>jena-hadoop-rdf-mapreduce</artifactId> - <version>${project.version}</version> - <classifier>tests</classifier> - <scope>test</scope> - </dependency> - <dependency> - <groupId>org.apache.mrunit</groupId> - <artifactId>mrunit</artifactId> - <scope>test</scope> - <classifier>hadoop2</classifier> - </dependency> - </dependencies> - - <build> - <plugins> - <!-- Assembly plugin is used to produce the runnable Hadoop JAR with all - dependencies contained therein --> - <plugin> - <artifactId>maven-assembly-plugin</artifactId> - <configuration> - <descriptors> - <descriptor>hadoop-job.xml</descriptor> - </descriptors> - </configuration> - <executions> - <execution> - <id>make-assembly</id> - <phase>package</phase> - <goals> - <goal>single</goal> - </goals> - </execution> - </executions> - </plugin> - </plugins> - </build> -</project> \ No newline at end of file http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java b/jena-hadoop-rdf/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java deleted file mode 100644 index 5f870ee..0000000 --- a/jena-hadoop-rdf/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/RdfStats.java +++ /dev/null @@ -1,405 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.jena.hadoop.rdf.stats; - -import io.airlift.command.Arguments; -import io.airlift.command.Command; -import io.airlift.command.Help; -import io.airlift.command.HelpOption; -import io.airlift.command.Option; -import io.airlift.command.OptionType; -import io.airlift.command.ParseArgumentsMissingException; -import io.airlift.command.ParseArgumentsUnexpectedException; -import io.airlift.command.ParseException; -import io.airlift.command.ParseOptionMissingException; -import io.airlift.command.ParseOptionMissingValueException; -import io.airlift.command.SingleCommand; -import io.airlift.command.model.CommandMetadata; - -import java.io.IOException; -import java.util.ArrayList; -import java.util.List; -import java.util.concurrent.TimeUnit; - -import javax.inject.Inject; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.util.Tool; -import org.apache.hadoop.util.ToolRunner; -import org.apache.jena.hadoop.rdf.stats.jobs.JobFactory; - - -/** - * Entry point for the Hadoop job, handles launching all the relevant Hadoop - * jobs - */ -@Command(name = "bin/hadoop jar PATH_TO_JAR com.yarcdata.urika.hadoop.rdf.stats.RdfStats", description = "A command which computes statistics on RDF data using Hadoop") -public class RdfStats implements Tool { - - static final String ANSI_RED = "\u001B[31m"; - static final String ANSI_RESET = "\u001B[0m"; - - private static final String DATA_TYPE_TRIPLES = "triples", DATA_TYPE_QUADS = "quads", DATA_TYPE_MIXED = "mixed"; - - /** - * Help option - */ - @Inject - public HelpOption helpOption; - - /** - * Gets/Sets whether all available statistics will be calculated - */ - @Option(name = { "-a", "--all" }, description = "Requests that all available statistics be calculated", type = OptionType.COMMAND) - public boolean all = false; - - /** - * Gets/Sets whether node usage counts will be calculated - */ - @Option(name = { "-n", "--node-count" }, description = "Requests that node usage counts be calculated", type = OptionType.COMMAND) - public boolean nodeCount = false; - - /** - * Gets/Sets whether characteristic sets will be calculated - */ - @Option(name = { "-c", "--characteristic-sets" }, description = "Requests that characteristic sets be calculated", type = OptionType.COMMAND) - public boolean characteristicSets = false; - - /** - * Gets/Sets whether type counts will be calculated - */ - @Option(name = { "-t", "--type-counts" }, description = "Requests that rdf:type usage counts be calculated", type = OptionType.COMMAND) - public boolean typeCount = false; - - /** - * Gets/Sets whether data type counts will be calculated - */ - @Option(name = { "-d", "--data-types" }, description = "Requests that literal data type usage counts be calculated", type = OptionType.COMMAND) - public boolean dataTypeCount = false; - - /** - * Gets/Sets whether namespace counts will be calculated - */ - @Option(name = { "--namespaces" }, description = "Requests that namespace usage counts be calculated", type = OptionType.COMMAND) - public boolean namespaceCount = false; - - /** - * Gets/Sets the input data type used - */ - @Option(name = { "--input-type" }, allowedValues = { DATA_TYPE_MIXED, DATA_TYPE_QUADS, DATA_TYPE_TRIPLES }, description = "Specifies whether the input data is a mixture of quads and triples, just quads or just triples. Using the most specific data type will yield the most accurrate statistics") - public String inputType = DATA_TYPE_MIXED; - - /** - * Gets/Sets the output path - */ - @Option(name = { "-o", "--output" }, title = "OutputPath", description = "Sets the output path", arity = 1, required = true) - public String outputPath = null; - - /** - * Gets/Sets the input path(s) - */ - @Arguments(description = "Sets the input path(s)", title = "InputPath", required = true) - public List<String> inputPaths = new ArrayList<String>(); - - private Configuration config; - - /** - * Entry point method - * - * @param args - * Arguments - * @throws Exception - */ - public static void main(String[] args) throws Exception { - try { - // Run and exit with result code if no errors bubble up - // Note that the exit code may still be a error code - int res = ToolRunner.run(new Configuration(true), new RdfStats(), args); - System.exit(res); - } catch (Exception e) { - System.err.println(ANSI_RED + e.getMessage()); - e.printStackTrace(System.err); - } finally { - System.err.print(ANSI_RESET); - } - // If any errors bubble up exit with non-zero code - System.exit(1); - } - - private static void showUsage() { - CommandMetadata metadata = SingleCommand.singleCommand(RdfStats.class).getCommandMetadata(); - StringBuilder builder = new StringBuilder(); - Help.help(metadata, builder); - System.err.print(ANSI_RESET); - System.err.println(builder.toString()); - System.exit(1); - } - - @Override - public void setConf(Configuration conf) { - this.config = conf; - } - - @Override - public Configuration getConf() { - return this.config; - } - - @Override - public int run(String[] args) throws Exception { - try { - // Parse custom arguments - RdfStats cmd = SingleCommand.singleCommand(RdfStats.class).parse(args); - - // Copy Hadoop configuration across - cmd.setConf(this.getConf()); - - // Show help if requested and exit with success - if (cmd.helpOption.showHelpIfRequested()) { - return 0; - } - - // Run the command and exit with success - cmd.run(); - return 0; - - } catch (ParseOptionMissingException e) { - System.err.println(ANSI_RED + e.getMessage()); - System.err.println(); - showUsage(); - } catch (ParseOptionMissingValueException e) { - System.err.println(ANSI_RED + e.getMessage()); - System.err.println(); - showUsage(); - } catch (ParseArgumentsMissingException e) { - System.err.println(ANSI_RED + e.getMessage()); - System.err.println(); - showUsage(); - } catch (ParseArgumentsUnexpectedException e) { - System.err.println(ANSI_RED + e.getMessage()); - System.err.println(); - showUsage(); - // TODO Re-enable as and when we upgrade Airline - // } catch (ParseOptionIllegalValueException e) { - // System.err.println(ANSI_RED + e.getMessage()); - // System.err.println(); - // showUsage(); - } catch (ParseException e) { - System.err.println(ANSI_RED + e.getMessage()); - System.err.println(); - showUsage(); - } catch (UnsupportedOperationException e) { - System.err.println(ANSI_RED + e.getMessage()); - } catch (Throwable e) { - System.err.println(ANSI_RED + e.getMessage()); - e.printStackTrace(System.err); - } finally { - System.err.print(ANSI_RESET); - } - return 1; - } - - private void run() throws Throwable { - if (!this.outputPath.endsWith("/")) { - this.outputPath += "/"; - } - - // If all statistics requested turn on all statistics - if (this.all) { - this.nodeCount = true; - this.characteristicSets = true; - this.typeCount = true; - this.dataTypeCount = true; - this.namespaceCount = true; - } - - // How many statistics were requested? - int statsRequested = 0; - if (this.nodeCount) - statsRequested++; - if (this.characteristicSets) - statsRequested++; - if (this.typeCount) - statsRequested++; - if (this.dataTypeCount) - statsRequested++; - if (this.namespaceCount) - statsRequested++; - - // Error if no statistics requested - if (statsRequested == 0) { - System.err - .println("You did not request any statistics to be calculated, please use one/more of the relevant options to select the statistics to be computed"); - return; - } - int statsComputed = 1; - - // Compute statistics - if (this.nodeCount) { - Job job = this.selectNodeCountJob(); - statsComputed = this.computeStatistic(job, statsComputed, statsRequested); - } - if (this.typeCount) { - Job[] jobs = this.selectTypeCountJobs(); - statsComputed = this.computeStatistic(jobs, false, false, statsComputed, statsRequested); - } - if (this.dataTypeCount) { - Job job = this.selectDataTypeCountJob(); - statsComputed = this.computeStatistic(job, statsComputed, statsRequested); - } - if (this.namespaceCount) { - Job job = this.selectNamespaceCountJob(); - statsComputed = this.computeStatistic(job, statsComputed, statsRequested); - } - if (this.characteristicSets) { - Job[] jobs = this.selectCharacteristicSetJobs(); - statsComputed = this.computeStatistic(jobs, false, false, statsComputed, statsRequested); - } - } - - private int computeStatistic(Job job, int statsComputed, int statsRequested) throws Throwable { - System.out.println(String.format("Computing Statistic %d of %d requested", statsComputed, statsRequested)); - this.runJob(job); - System.out.println(String.format("Computed Statistic %d of %d requested", statsComputed, statsRequested)); - System.out.println(); - return ++statsComputed; - } - - private int computeStatistic(Job[] jobs, boolean continueOnFailure, boolean continueOnError, int statsComputed, - int statsRequested) { - System.out.println(String.format("Computing Statistic %d of %d requested", statsComputed, statsRequested)); - this.runJobSequence(jobs, continueOnFailure, continueOnError); - System.out.println(String.format("Computed Statistic %d of %d requested", statsComputed, statsRequested)); - System.out.println(); - return ++statsComputed; - } - - private boolean runJob(Job job) throws Throwable { - System.out.println("Submitting Job " + job.getJobName()); - long start = System.nanoTime(); - try { - job.submit(); - if (job.monitorAndPrintJob()) { - System.out.println("Job " + job.getJobName() + " succeeded"); - return true; - } else { - System.out.println("Job " + job.getJobName() + " failed"); - return false; - } - } catch (Throwable e) { - System.out.println("Unexpected failure in Job " + job.getJobName()); - throw e; - } finally { - long end = System.nanoTime(); - System.out.println("Job " + job.getJobName() + " finished after " - + String.format("%,d milliseconds", TimeUnit.NANOSECONDS.toMillis(end - start))); - System.out.println(); - } - } - - private void runJobSequence(Job[] jobs, boolean continueOnFailure, boolean continueOnError) { - for (int i = 0; i < jobs.length; i++) { - Job job = jobs[i]; - try { - boolean success = this.runJob(job); - if (!success && !continueOnFailure) - throw new IllegalStateException("Unable to complete job sequence because Job " + job.getJobName() + " failed"); - } catch (IllegalStateException e) { - throw e; - } catch (Throwable e) { - if (!continueOnError) - throw new IllegalStateException("Unable to complete job sequence because job " + job.getJobName() - + " errorred", e); - } - } - } - - private Job selectNodeCountJob() throws IOException { - String realOutputPath = outputPath + "node-counts/"; - String[] inputs = new String[this.inputPaths.size()]; - this.inputPaths.toArray(inputs); - - if (DATA_TYPE_QUADS.equals(this.inputType)) { - return JobFactory.getQuadNodeCountJob(this.config, inputs, realOutputPath); - } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { - return JobFactory.getTripleNodeCountJob(this.config, inputs, realOutputPath); - } else { - return JobFactory.getNodeCountJob(this.config, inputs, realOutputPath); - } - } - - private Job selectDataTypeCountJob() throws IOException { - String realOutputPath = outputPath + "data-type-counts/"; - String[] inputs = new String[this.inputPaths.size()]; - this.inputPaths.toArray(inputs); - - if (DATA_TYPE_QUADS.equals(this.inputType)) { - return JobFactory.getQuadDataTypeCountJob(this.config, inputs, realOutputPath); - } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { - return JobFactory.getTripleDataTypeCountJob(this.config, inputs, realOutputPath); - } else { - return JobFactory.getDataTypeCountJob(this.config, inputs, realOutputPath); - } - } - - private Job selectNamespaceCountJob() throws IOException { - String realOutputPath = outputPath + "namespace-counts/"; - String[] inputs = new String[this.inputPaths.size()]; - this.inputPaths.toArray(inputs); - - if (DATA_TYPE_QUADS.equals(this.inputType)) { - return JobFactory.getQuadNamespaceCountJob(this.config, inputs, realOutputPath); - } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { - return JobFactory.getTripleNamespaceCountJob(this.config, inputs, realOutputPath); - } else { - return JobFactory.getNamespaceCountJob(this.config, inputs, realOutputPath); - } - } - - private Job[] selectCharacteristicSetJobs() throws IOException { - String intermediateOutputPath = outputPath + "characteristics/intermediate/"; - String finalOutputPath = outputPath + "characteristics/final/"; - String[] inputs = new String[this.inputPaths.size()]; - this.inputPaths.toArray(inputs); - - if (DATA_TYPE_QUADS.equals(this.inputType)) { - return JobFactory.getQuadCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); - } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { - return JobFactory.getTripleCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); - } else { - return JobFactory.getCharacteristicSetJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); - } - } - - private Job[] selectTypeCountJobs() throws IOException { - String intermediateOutputPath = outputPath + "type-declarations/"; - String finalOutputPath = outputPath + "type-counts/"; - String[] inputs = new String[this.inputPaths.size()]; - this.inputPaths.toArray(inputs); - - if (DATA_TYPE_QUADS.equals(this.inputType)) { - return JobFactory.getQuadTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); - } else if (DATA_TYPE_TRIPLES.equals(this.inputType)) { - return JobFactory.getTripleTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); - } else { - return JobFactory.getTypeCountJobs(this.config, inputs, intermediateOutputPath, finalOutputPath); - } - } -} http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java b/jena-hadoop-rdf/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java deleted file mode 100644 index 55bb8af..0000000 --- a/jena-hadoop-rdf/jena-elephas-stats/src/main/java/org/apache/jena/hadoop/rdf/stats/jobs/JobFactory.java +++ /dev/null @@ -1,757 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.jena.hadoop.rdf.stats.jobs; - -import java.io.IOException; - -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.LongWritable; -import org.apache.hadoop.io.NullWritable; -import org.apache.hadoop.io.SequenceFile.CompressionType; -import org.apache.hadoop.io.Text; -import org.apache.hadoop.io.compress.BZip2Codec; -import org.apache.hadoop.mapreduce.Job; -import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapreduce.lib.input.NLineInputFormat; -import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat; -import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; -import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat; -import org.apache.hadoop.util.StringUtils; -import org.apache.jena.hadoop.rdf.io.input.QuadsInputFormat; -import org.apache.jena.hadoop.rdf.io.input.TriplesInputFormat; -import org.apache.jena.hadoop.rdf.io.input.TriplesOrQuadsInputFormat; -import org.apache.jena.hadoop.rdf.io.input.nquads.NQuadsInputFormat; -import org.apache.jena.hadoop.rdf.io.input.ntriples.NTriplesInputFormat; -import org.apache.jena.hadoop.rdf.io.output.nquads.NQuadsOutputFormat; -import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesNodeOutputFormat; -import org.apache.jena.hadoop.rdf.io.output.ntriples.NTriplesOutputFormat; -import org.apache.jena.hadoop.rdf.mapreduce.KeyMapper; -import org.apache.jena.hadoop.rdf.mapreduce.RdfMapReduceConstants; -import org.apache.jena.hadoop.rdf.mapreduce.TextCountReducer; -import org.apache.jena.hadoop.rdf.mapreduce.characteristics.CharacteristicSetReducer; -import org.apache.jena.hadoop.rdf.mapreduce.characteristics.QuadCharacteristicSetGeneratingReducer; -import org.apache.jena.hadoop.rdf.mapreduce.characteristics.TripleCharacteristicSetGeneratingReducer; -import org.apache.jena.hadoop.rdf.mapreduce.count.NodeCountReducer; -import org.apache.jena.hadoop.rdf.mapreduce.count.QuadNodeCountMapper; -import org.apache.jena.hadoop.rdf.mapreduce.count.TripleNodeCountMapper; -import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.QuadDataTypeCountMapper; -import org.apache.jena.hadoop.rdf.mapreduce.count.datatypes.TripleDataTypeCountMapper; -import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.QuadNamespaceCountMapper; -import org.apache.jena.hadoop.rdf.mapreduce.count.namespaces.TripleNamespaceCountMapper; -import org.apache.jena.hadoop.rdf.mapreduce.count.positional.QuadObjectCountMapper; -import org.apache.jena.hadoop.rdf.mapreduce.count.positional.TripleObjectCountMapper; -import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.QuadFilterByPredicateMapper; -import org.apache.jena.hadoop.rdf.mapreduce.filter.positional.TripleFilterByPredicateUriMapper; -import org.apache.jena.hadoop.rdf.mapreduce.group.QuadGroupBySubjectMapper; -import org.apache.jena.hadoop.rdf.mapreduce.group.TripleGroupBySubjectMapper; -import org.apache.jena.hadoop.rdf.types.CharacteristicSetWritable; -import org.apache.jena.hadoop.rdf.types.NodeWritable; -import org.apache.jena.hadoop.rdf.types.QuadWritable; -import org.apache.jena.hadoop.rdf.types.TripleWritable; - -import com.hp.hpl.jena.vocabulary.RDF; - -/** - * Factory that can produce {@link Job} instances for computing various RDF - * statistics - * - * - * - */ -public class JobFactory { - - /** - * Private constructor prevents instantiation - */ - private JobFactory() { - } - - /** - * Gets a job for computing node counts on RDF triple inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getTripleNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Triples Node Usage Count"); - - // Map/Reduce classes - job.setMapperClass(TripleNodeCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(TriplesInputFormat.class); - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } - - /** - * Gets a job for computing node counts on RDF quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getQuadNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Quads Node Usage Count"); - - // Map/Reduce classes - job.setMapperClass(QuadNodeCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(QuadsInputFormat.class); - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } - - /** - * Gets a job for computing node counts on RDF triple and/or quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getNodeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Node Usage Count"); - - // Map/Reduce classes - job.setMapperClass(QuadNodeCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(TriplesOrQuadsInputFormat.class); - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } - - /** - * Gets a sequence of jobs that can be used to compute characteristic sets - * for RDF triples - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param intermediateOutputPath - * Intermediate output path - * @param outputPath - * Final output path - * @return Sequence of jobs - * @throws IOException - */ - public static Job[] getTripleCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, - String outputPath) throws IOException { - Job[] jobs = new Job[2]; - - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Triples Characteristic Set (Generation)"); - - // Map/Reduce classes - job.setMapperClass(TripleGroupBySubjectMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(TripleWritable.class); - job.setReducerClass(TripleCharacteristicSetGeneratingReducer.class); - job.setOutputKeyClass(CharacteristicSetWritable.class); - job.setOutputValueClass(NullWritable.class); - - // Input and Output - job.setInputFormatClass(TriplesInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); - SequenceFileOutputFormat.setCompressOutput(job, true); - FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class); - SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); - - jobs[0] = job; - - job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Triples Characteristic Set (Reduction)"); - - // Map/Reduce classes - job.setMapperClass(KeyMapper.class); - job.setMapOutputKeyClass(CharacteristicSetWritable.class); - job.setMapOutputValueClass(CharacteristicSetWritable.class); - job.setReducerClass(CharacteristicSetReducer.class); - job.setOutputKeyClass(CharacteristicSetWritable.class); - job.setOutputValueClass(CharacteristicSetWritable.class); - - // Input and Output - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - FileInputFormat.setInputPaths(job, intermediateOutputPath); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - jobs[1] = job; - return jobs; - } - - /** - * Gets a sequence of jobs that can be used to compute characteristic sets - * for RDF quads - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param intermediateOutputPath - * Intermediate output path - * @param outputPath - * Final output path - * @return Sequence of jobs - * @throws IOException - */ - public static Job[] getQuadCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, - String outputPath) throws IOException { - Job[] jobs = new Job[2]; - - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Quads Characteristic Set (Generation)"); - - // Map/Reduce classes - job.setMapperClass(QuadGroupBySubjectMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(QuadWritable.class); - job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class); - job.setOutputKeyClass(CharacteristicSetWritable.class); - job.setOutputValueClass(NullWritable.class); - - // Input and Output - job.setInputFormatClass(QuadsInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); - SequenceFileOutputFormat.setCompressOutput(job, true); - FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class); - SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); - - jobs[0] = job; - - job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Quads Characteristic Set (Reduction)"); - - // Map/Reduce classes - job.setMapperClass(KeyMapper.class); - job.setMapOutputKeyClass(CharacteristicSetWritable.class); - job.setMapOutputValueClass(CharacteristicSetWritable.class); - job.setReducerClass(CharacteristicSetReducer.class); - job.setOutputKeyClass(CharacteristicSetWritable.class); - job.setOutputValueClass(CharacteristicSetWritable.class); - - // Input and Output - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - FileInputFormat.setInputPaths(job, intermediateOutputPath); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - jobs[1] = job; - return jobs; - } - - /** - * Gets a sequence of jobs that can be used to compute characteristic sets - * for RDF triple and/or quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param intermediateOutputPath - * Intermediate output path - * @param outputPath - * Final output path - * @return Sequence of jobs - * @throws IOException - */ - public static Job[] getCharacteristicSetJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, - String outputPath) throws IOException { - Job[] jobs = new Job[2]; - - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Characteristic Set (Generation)"); - - // Map/Reduce classes - job.setMapperClass(QuadGroupBySubjectMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(QuadWritable.class); - job.setReducerClass(QuadCharacteristicSetGeneratingReducer.class); - job.setOutputKeyClass(CharacteristicSetWritable.class); - job.setOutputValueClass(NullWritable.class); - - // Input and Output - job.setInputFormatClass(TriplesOrQuadsInputFormat.class); - job.setOutputFormatClass(SequenceFileOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); - SequenceFileOutputFormat.setCompressOutput(job, true); - FileOutputFormat.setOutputCompressorClass(job, BZip2Codec.class); - SequenceFileOutputFormat.setOutputCompressionType(job, CompressionType.BLOCK); - - jobs[0] = job; - - job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Characteristic Set (Reduction)"); - - // Map/Reduce classes - job.setMapperClass(KeyMapper.class); - job.setMapOutputKeyClass(CharacteristicSetWritable.class); - job.setMapOutputValueClass(CharacteristicSetWritable.class); - job.setReducerClass(CharacteristicSetReducer.class); - job.setOutputKeyClass(CharacteristicSetWritable.class); - job.setOutputValueClass(CharacteristicSetWritable.class); - - // Input and Output - job.setInputFormatClass(SequenceFileInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - FileInputFormat.setInputPaths(job, intermediateOutputPath); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - jobs[1] = job; - return jobs; - } - - /** - * Gets a job for computing type counts on RDF triple inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param intermediateOutputPath - * Path for intermediate output which will be all the type - * declaration triples present in the inputs - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job[] getTripleTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, - String outputPath) throws IOException { - Job[] jobs = new Job[2]; - - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Type Triples Extraction"); - - // Map/Reduce classes - job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI()); - job.setMapperClass(TripleFilterByPredicateUriMapper.class); - job.setMapOutputKeyClass(LongWritable.class); - job.setMapOutputValueClass(TripleWritable.class); - - // Input and Output Format - job.setInputFormatClass(TriplesInputFormat.class); - job.setOutputFormatClass(NTriplesOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); - - jobs[0] = job; - - // Object Node Usage count job - job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Triples Type Usage Count"); - - // Map/Reduce classes - job.setMapperClass(TripleObjectCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(NTriplesInputFormat.class); - NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be - // better if this was - // intelligently - // configured - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, intermediateOutputPath); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - jobs[1] = job; - - return jobs; - } - - /** - * Gets a job for computing type counts on RDF quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param intermediateOutputPath - * Path for intermediate output which will be all the type - * declaration quads present in the inputs - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job[] getQuadTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, - String outputPath) throws IOException { - Job[] jobs = new Job[2]; - - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Type Quads Extraction"); - - // Map/Reduce classes - job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI()); - job.setMapperClass(QuadFilterByPredicateMapper.class); - job.setMapOutputKeyClass(LongWritable.class); - job.setMapOutputValueClass(QuadWritable.class); - - // Input and Output Format - job.setInputFormatClass(QuadsInputFormat.class); - job.setOutputFormatClass(NQuadsOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); - - jobs[0] = job; - - // Object Node Usage count job - job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Quads Type Usage Count"); - - // Map/Reduce classes - job.setMapperClass(QuadObjectCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(NQuadsInputFormat.class); - NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be - // better if this was - // intelligently - // configured - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, intermediateOutputPath); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - jobs[1] = job; - - return jobs; - } - - /** - * Gets a job for computing type counts on RDF triple and/or quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param intermediateOutputPath - * Path for intermediate output which will be all the type - * declaration quads present in the inputs - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job[] getTypeCountJobs(Configuration config, String[] inputPaths, String intermediateOutputPath, - String outputPath) throws IOException { - Job[] jobs = new Job[2]; - - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Type Extraction"); - - // Map/Reduce classes - job.getConfiguration().setStrings(RdfMapReduceConstants.FILTER_PREDICATE_URIS, RDF.type.getURI()); - job.setMapperClass(QuadFilterByPredicateMapper.class); - job.setMapOutputKeyClass(LongWritable.class); - job.setMapOutputValueClass(QuadWritable.class); - - // Input and Output Format - job.setInputFormatClass(TriplesOrQuadsInputFormat.class); - job.setOutputFormatClass(NQuadsOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(intermediateOutputPath)); - - jobs[0] = job; - - // Object Node Usage count job - job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Type Usage Count"); - - // Map/Reduce classes - job.setMapperClass(QuadObjectCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(NQuadsInputFormat.class); - NLineInputFormat.setNumLinesPerSplit(job, 10000); // TODO Would be - // better if this was - // intelligently - // configured - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, intermediateOutputPath); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - jobs[1] = job; - - return jobs; - } - - /** - * Gets a job for computing literal data type counts on RDF triple inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getTripleDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Triples Literal Data Type Usage Count"); - - // Map/Reduce classes - job.setMapperClass(TripleDataTypeCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(TriplesInputFormat.class); - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } - - /** - * Gets a job for computing literal data type counts on RDF quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getQuadDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Quads Literal Data Type Usage Count"); - - // Map/Reduce classes - job.setMapperClass(QuadDataTypeCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(QuadsInputFormat.class); - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } - - /** - * Gets a job for computing literal data type counts on RDF triple and/or - * quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getDataTypeCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Literal Data Type Usage Count"); - - // Map/Reduce classes - job.setMapperClass(QuadDataTypeCountMapper.class); - job.setMapOutputKeyClass(NodeWritable.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(NodeCountReducer.class); - - // Input and Output - job.setInputFormatClass(TriplesOrQuadsInputFormat.class); - job.setOutputFormatClass(NTriplesNodeOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } - - /** - * Gets a job for computing literal data type counts on RDF triple inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getTripleNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Triples Namespace Usage Count"); - - // Map/Reduce classes - job.setMapperClass(TripleNamespaceCountMapper.class); - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(TextCountReducer.class); - - // Input and Output - job.setInputFormatClass(TriplesInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } - - /** - * Gets a job for computing literal data type counts on RDF quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getQuadNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Quads Namespace Usage Count"); - - // Map/Reduce classes - job.setMapperClass(QuadNamespaceCountMapper.class); - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(TextCountReducer.class); - - // Input and Output - job.setInputFormatClass(QuadsInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } - - /** - * Gets a job for computing literal data type counts on RDF triple and/or - * quad inputs - * - * @param config - * Configuration - * @param inputPaths - * Input paths - * @param outputPath - * Output path - * @return Job - * @throws IOException - */ - public static Job getNamespaceCountJob(Configuration config, String[] inputPaths, String outputPath) throws IOException { - Job job = Job.getInstance(config); - job.setJarByClass(JobFactory.class); - job.setJobName("RDF Namespace Usage Count"); - - // Map/Reduce classes - job.setMapperClass(QuadNamespaceCountMapper.class); - job.setMapOutputKeyClass(Text.class); - job.setMapOutputValueClass(LongWritable.class); - job.setReducerClass(TextCountReducer.class); - - // Input and Output - job.setInputFormatClass(TriplesOrQuadsInputFormat.class); - job.setOutputFormatClass(TextOutputFormat.class); - FileInputFormat.setInputPaths(job, StringUtils.arrayToString(inputPaths)); - FileOutputFormat.setOutputPath(job, new Path(outputPath)); - - return job; - } -} http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/jena-hadoop-rdf/pom.xml ---------------------------------------------------------------------- diff --git a/jena-hadoop-rdf/pom.xml b/jena-hadoop-rdf/pom.xml deleted file mode 100644 index 83f2819..0000000 --- a/jena-hadoop-rdf/pom.xml +++ /dev/null @@ -1,97 +0,0 @@ -<!-- Licensed to the Apache Software Foundation (ASF) under one or more contributor - license agreements. See the NOTICE file distributed with this work for additional - information regarding copyright ownership. The ASF licenses this file to - You under the Apache License, Version 2.0 (the "License"); you may not use - this file except in compliance with the License. You may obtain a copy of - the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required - by applicable law or agreed to in writing, software distributed under the - License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS - OF ANY KIND, either express or implied. See the License for the specific - language governing permissions and limitations under the License. --> -<project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" - xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd"> - <modelVersion>4.0.0</modelVersion> - <artifactId>jena-elephas</artifactId> - <version>0.9.0-SNAPSHOT</version> - <packaging>pom</packaging> - - <parent> - <groupId>org.apache.jena</groupId> - <artifactId>jena-parent</artifactId> - <version>12-SNAPSHOT</version> - <relativePath>../jena-parent</relativePath> - </parent> - - <name>Apache Jena - Elephas</name> - <description>A collection of tools for working with RDF on the Hadoop platform</description> - - <modules> - <module>jena-elephas-io</module> - <module>jena-elephas-common</module> - <module>jena-elephas-mapreduce</module> - <module>jena-elephas-stats</module> - </modules> - - <!-- Properties common across all profiles --> - <properties> - <plugin.compiler.version>2.5.1</plugin.compiler.version> - <arq.version>2.12.2-SNAPSHOT</arq.version> - <junit.version>4.11</junit.version> - <mrunit.version>1.0.0</mrunit.version> - </properties> - - <!-- Profiles to allow building for different Hadoop versions --> - <profiles> - <!-- Hadoop 2.x Latest --> - <profile> - <id>hadoop_2x</id> - <activation> - <activeByDefault>true</activeByDefault> - </activation> - <properties> - <hadoop.version>2.5.0</hadoop.version> - </properties> - </profile> - </profiles> - - <dependencyManagement> - <dependencies> - <!-- Hadoop Dependencies --> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-common</artifactId> - <version>${hadoop.version}</version> - </dependency> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-mapreduce-client-common</artifactId> - <version>${hadoop.version}</version> - </dependency> - - <!-- Jena Dependencies --> - <dependency> - <groupId>org.apache.jena</groupId> - <artifactId>jena-arq</artifactId> - <version>${arq.version}</version> - </dependency> - - <!-- Test Dependencies --> - <dependency> - <groupId>org.apache.hadoop</groupId> - <artifactId>hadoop-minicluster</artifactId> - <version>${hadoop.version}</version> - </dependency> - <dependency> - <groupId>junit</groupId> - <artifactId>junit</artifactId> - <version>${junit.version}</version> - </dependency> - <dependency> - <groupId>org.apache.mrunit</groupId> - <artifactId>mrunit</artifactId> - <version>${mrunit.version}</version> - <classifier>hadoop2</classifier> - </dependency> - </dependencies> - </dependencyManagement> -</project> http://git-wip-us.apache.org/repos/asf/jena/blob/49c4cffe/pom.xml ---------------------------------------------------------------------- diff --git a/pom.xml b/pom.xml index 89229f0..0a64185 100644 --- a/pom.xml +++ b/pom.xml @@ -77,7 +77,7 @@ <!-- <module>jena-maven-tools</module> --> <!-- <module>apache-jena-libs</module> --> <!-- <module>apache-jena</module> --> - <!-- <module>jena-hadoop-rdf</module> --> + <!-- <module>jena-elephas</module> --> <module>jena-extras</module> </modules> </profile> @@ -116,7 +116,7 @@ <module>jena-jdbc</module> <module>jena-maven-tools</module> - <module>jena-hadoop-rdf</module> + <module>jena-elephas</module> <module>apache-jena-libs</module> <module>apache-jena</module>
