keith-turner commented on a change in pull request #39: Created Accumulo/Spark 
example
URL: https://github.com/apache/accumulo-examples/pull/39#discussion_r269311770
 
 

 ##########
 File path: spark/src/main/java/org/apache/accumulo/spark/CopyPlus5K.java
 ##########
 @@ -0,0 +1,124 @@
+package org.apache.accumulo.spark;
+
+import java.util.Properties;
+
+import org.apache.accumulo.core.client.Accumulo;
+import org.apache.accumulo.core.client.AccumuloClient;
+import org.apache.accumulo.core.client.BatchWriter;
+import org.apache.accumulo.core.client.MutationsRejectedException;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.hadoop.mapreduce.AccumuloFileOutputFormat;
+import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+public class CopyPlus5K {
+
+  public static void main(String[] args) throws Exception {
+
+    if ((!args[0].equals("batch") && !args[0].equals("bulk")) || 
args[1].isEmpty()) {
+        System.out.println("Usage: ./run.sh [batch|bulk] 
/path/to/accumulo-client.properties");
+        System.exit(1);
+    }
+
+    final String inputTable = "spark_example_input";
+    final String outputTable = "spark_example_output";
+    final Properties props = 
Accumulo.newClientProperties().from(args[1]).build();
+
+    try (AccumuloClient client = Accumulo.newClient().from(props).build()) {
+      // Delete tables (if they exist) and create new tables
+      if (client.tableOperations().exists(inputTable)) {
+        client.tableOperations().delete(inputTable);
+      }
+      client.tableOperations().create(inputTable);
+      if (client.tableOperations().exists(outputTable)) {
+        client.tableOperations().delete(outputTable);
+      }
+      client.tableOperations().create(outputTable);
+      // Write data to input table
+      try (BatchWriter bw = client.createBatchWriter(inputTable)) {
+        for (int i = 0; i < 100; i++) {
+          Mutation m = new Mutation(String.format("%09d", i));
+          m.at().family("cf1").qualifier("cq1").put("" + i);
+          bw.addMutation(m);
+        }
+      }
+    }
+
+    SparkConf sparkConf = new SparkConf();
+    sparkConf.setAppName("CopyPlus5K");
+
+    JavaSparkContext sc = new JavaSparkContext(sparkConf);
+
+    Job job = Job.getInstance();
+
+    // Read input from Accumulo
+    
AccumuloInputFormat.configure().clientProperties(props).table(inputTable).store(job);
+    JavaPairRDD<Key,Value> data = sc.newAPIHadoopRDD(job.getConfiguration(),
+        AccumuloInputFormat.class, Key.class, Value.class);
+
+    // Add 5K to all values
+    JavaPairRDD<Key, Value> dataPlus5K = data.mapValues(v ->
+        new Value("" + (Integer.parseInt(v.toString()) + 5_000)));
+
+    if (args[0].equals("batch")) {
+      // Write output using batch writer
+      dataPlus5K.foreachPartition(iter -> {
 
 Review comment:
   The bulk and batch code paths could each be extracted into their own method to make `main` shorter.

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
us...@infra.apache.org


With regards,
Apache Git Services

Reply via email to