[GitHub] [accumulo-examples] keith-turner commented on a change in pull request #39: Created Accumulo/Spark example

GitBox Wed, 27 Mar 2019 14:37:39 -0700

keith-turner commented on a change in pull request #39: Created Accumulo/Spark 
example
URL: https://github.com/apache/accumulo-examples/pull/39#discussion_r269780361


 ##########
 File path: spark/src/main/java/org/apache/accumulo/spark/CopyPlus5K.java
 ##########
 @@ -0,0 +1,155 @@
+package org.apache.accumulo.spark;
+
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.List;
+import java.util.Properties;
+
+import org.apache.accumulo.core.client.Accumulo;
+import org.apache.accumulo.core.client.AccumuloClient;
+import org.apache.accumulo.core.client.BatchWriter;
+import org.apache.accumulo.core.client.MutationsRejectedException;
+import org.apache.accumulo.core.data.Key;
+import org.apache.accumulo.core.data.Mutation;
+import org.apache.accumulo.core.data.Value;
+import org.apache.accumulo.hadoop.mapreduce.AccumuloFileOutputFormat;
+import org.apache.accumulo.hadoop.mapreduce.AccumuloInputFormat;
+import org.apache.hadoop.conf.Configuration;
+import org.apache.hadoop.fs.FileSystem;
+import org.apache.hadoop.fs.Path;
+import org.apache.hadoop.mapreduce.Job;
+import org.apache.spark.Partitioner;
+import org.apache.spark.SparkConf;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.JavaSparkContext;
+
+public class CopyPlus5K {
+
+  public static class AccumuloRangePartitioner extends Partitioner {
+
+    private static final long serialVersionUID = 1L;
+    private List<String> splits;
+
+    AccumuloRangePartitioner(String... listSplits) {
+      this.splits = Arrays.asList(listSplits);
+    }
+
+    @Override
+    public int getPartition(Object o) {
+      int index = Collections.binarySearch(splits, 
((Key)o).getRow().toString());
+      index = index < 0 ? (index + 1) * -1 : index;
+      return index;
+    }
+
+    @Override
+    public int numPartitions() {
+      return splits.size() + 1;
+    }
+  }
+
+  private static void cleanupAndCreateTables(Properties props) throws 
Exception {
+    FileSystem hdfs = FileSystem.get(new Configuration());
+    if (hdfs.exists(rootPath)) {
+      hdfs.delete(rootPath, true);
+    }
+    try (AccumuloClient client = Accumulo.newClient().from(props).build()) {
+      if (client.tableOperations().exists(inputTable)) {
+        client.tableOperations().delete(inputTable);
+      }
+      if (client.tableOperations().exists(outputTable)) {
+        client.tableOperations().delete(outputTable);
+      }
+      // Create tables
+      client.tableOperations().create(inputTable);
+      client.tableOperations().create(outputTable);
+
+      // Write data to input table
+      try (BatchWriter bw = client.createBatchWriter(inputTable)) {
+        for (int i = 0; i < 100; i++) {
+          Mutation m = new Mutation(String.format("%03d", i));
+          m.at().family("cf1").qualifier("cq1").put("" + i);
+          bw.addMutation(m);
+        }
+      }
+    }
+  }
+
+  private static final String inputTable = "spark_example_input";
+  private static final String outputTable = "spark_example_output";
+  private static final Path rootPath = new Path("/spark_example/");
+
+  public static void main(String[] args) throws Exception {
+
+    if ((!args[0].equals("batch") && !args[0].equals("bulk")) || 
args[1].isEmpty()) {
+        System.out.println("Usage: ./run.sh [batch|bulk] 
/path/to/accumulo-client.properties");
+        System.exit(1);
+    }
+
+    // Read client properties from file
+    final Properties props = 
Accumulo.newClientProperties().from(args[1]).build();
+
+    cleanupAndCreateTables(props);
+
+    SparkConf conf = new SparkConf();
+    conf.setAppName("CopyPlus5K");
+    // KryoSerializer is needed for serializing Accumulo Key when partitioning 
data for bulk import
+    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
+    conf.registerKryoClasses(new Class[]{Key.class, Value.class, 
Properties.class});
+
+    JavaSparkContext sc = new JavaSparkContext(conf);
+
+    Job job = Job.getInstance();
+
+    // Read input from Accumulo
+    
AccumuloInputFormat.configure().clientProperties(props).table(inputTable).store(job);
+    JavaPairRDD<Key,Value> data = sc.newAPIHadoopRDD(job.getConfiguration(),
+        AccumuloInputFormat.class, Key.class, Value.class);
+
+    // Add 5K to all values
+    JavaPairRDD<Key, Value> dataPlus5K = data.mapValues(v ->
+        new Value("" + (Integer.parseInt(v.toString()) + 5_000)));
+
+    if (args[0].equals("batch")) {
+      // Write output using batch writer
+      dataPlus5K.foreachPartition(iter -> {
+        try (AccumuloClient client = Accumulo.newClient().from(props).build();
 
 Review comment:
   A comment like the following may be useful.
   
   ```java
    // Intentionally created an Accumulo client for each partition to avoid
    // attempting to serialize it and send it to each remote process.
    try (AccumuloClient client = Accumulo.newClient().from(props).build();
   ```

----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.
 
For queries about this service, please contact Infrastructure at:
[email protected]


With regards,
Apache Git Services

[GitHub] [accumulo-examples] keith-turner commented on a change in pull request #39: Created Accumulo/Spark example

Reply via email to