keith-turner commented on a change in pull request #39: Created Accumulo/Spark example URL: https://github.com/apache/accumulo-examples/pull/39#discussion_r269311770
########## File path: spark/src/main/java/org/apache/accumulo/spark/CopyPlus5K.java ########## @@ -0,0 +1,124 @@ +package org.apache.accumulo.spark; + +import java.util.Properties; + +import org.apache.accumulo.core.client.Accumulo; +import org.apache.accumulo.core.client.AccumuloClient; +import org.apache.accumulo.core.client.BatchWriter; +import org.apache.accumulo.core.client.MutationsRejectedException; +import org.apache.accumulo.core.data.Key; +import org.apache.accumulo.core.data.Mutation; +import org.apache.accumulo.core.data.Value; +import org.apache.accumulo.hadoop.mapreduce.AccumuloFileOutputFormat; +import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat; +import org.apache.hadoop.conf.Configuration; +import org.apache.hadoop.fs.FileSystem; +import org.apache.hadoop.fs.Path; +import org.apache.hadoop.mapreduce.Job; +import org.apache.spark.SparkConf; +import org.apache.spark.api.java.JavaPairRDD; +import org.apache.spark.api.java.JavaSparkContext; + +public class CopyPlus5K { + + public static void main(String[] args) throws Exception { + + if ((!args[0].equals("batch") && !args[0].equals("bulk")) || args[1].isEmpty()) { + System.out.println("Usage: ./run.sh [batch|bulk] /path/to/accumulo-client.properties"); + System.exit(1); + } + + final String inputTable = "spark_example_input"; + final String outputTable = "spark_example_output"; + final Properties props = Accumulo.newClientProperties().from(args[1]).build(); + + try (AccumuloClient client = Accumulo.newClient().from(props).build()) { + // Delete tables (if they exist) and create new tables + if (client.tableOperations().exists(inputTable)) { + client.tableOperations().delete(inputTable); + } + client.tableOperations().create(inputTable); + if (client.tableOperations().exists(outputTable)) { + client.tableOperations().delete(outputTable); + } + client.tableOperations().create(outputTable); + // Write data to input table + try (BatchWriter bw = client.createBatchWriter(inputTable)) { + for 
(int i = 0; i < 100; i++) { + Mutation m = new Mutation(String.format("%09d", i)); + m.at().family("cf1").qualifier("cq1").put("" + i); + bw.addMutation(m); + } + } + } + + SparkConf sparkConf = new SparkConf(); + sparkConf.setAppName("CopyPlus5K"); + + JavaSparkContext sc = new JavaSparkContext(sparkConf); + + Job job = Job.getInstance(); + + // Read input from Accumulo + AccumuloInputFormat.configure().clientProperties(props).table(inputTable).store(job); + JavaPairRDD<Key,Value> data = sc.newAPIHadoopRDD(job.getConfiguration(), + AccumuloInputFormat.class, Key.class, Value.class); + + // Add 5K to all values + JavaPairRDD<Key, Value> dataPlus5K = data.mapValues(v -> + new Value("" + (Integer.parseInt(v.toString()) + 5_000))); + + if (args[0].equals("batch")) { + // Write output using batch writer + dataPlus5K.foreachPartition(iter -> { Review comment: Could put the bulk and batch code into separate methods to make main shorter. ---------------------------------------------------------------- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org With regards, Apache Git Services