spark: GroupByShuffler.java ShuffleTran.java SortByShuffler.java SparkShuffler.java

xuefu Thu, 31 Jul 2014 03:39:04 -0700

Author: xuefu
Date: Thu Jul 31 10:38:20 2014
New Revision: 1614850

URL: http://svn.apache.org/r1614850
Log:
HIVE-7526: Research to use groupby transformation to replace Hive's existing 
partitionByKey and SparkCollector combination [Spark Branch] (missing new files 
and deleted file)


Added:
    
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GroupByShuffler.java
    
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SortByShuffler.java
    
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkShuffler.java
Removed:
    
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/ShuffleTran.java

Added: 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GroupByShuffler.java
URL: 
http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GroupByShuffler.java?rev=1614850&view=auto
==============================================================================
--- 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GroupByShuffler.java
 (added)
+++ 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/GroupByShuffler.java
 Thu Jul 31 10:38:20 2014
@@ -0,0 +1,14 @@
+package org.apache.hadoop.hive.ql.exec.spark;
+
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.spark.api.java.JavaPairRDD;
+
+/**
+ * {@link SparkShuffler} that shuffles by delegating to
+ * {@code JavaPairRDD.groupByKey()}.
+ */
+public class GroupByShuffler implements SparkShuffler {
+
+  /**
+   * Groups the input pairs by key.
+   *
+   * @param input key/value pairs to shuffle
+   * @return the result of calling {@code groupByKey()} on {@code input}
+   */
+  @Override
+  public JavaPairRDD<BytesWritable, Iterable<BytesWritable>> shuffle(
+      JavaPairRDD<BytesWritable, BytesWritable> input) {
+    // No partitioner is passed, so Spark's default is used
+    // (noted by the original author as hash partitioning).
+    final JavaPairRDD<BytesWritable, Iterable<BytesWritable>> grouped = input.groupByKey();
+    return grouped;
+  }
+
+}

Added: 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SortByShuffler.java
URL: 
http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SortByShuffler.java?rev=1614850&view=auto
==============================================================================
--- 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SortByShuffler.java
 (added)
+++ 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SortByShuffler.java
 Thu Jul 31 10:38:20 2014
@@ -0,0 +1,83 @@
+/**
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.spark;
+
+import java.util.ArrayList;
+import java.util.Iterator;
+import java.util.List;
+import java.util.NoSuchElementException;
+
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.spark.api.java.JavaPairRDD;
+import org.apache.spark.api.java.function.PairFlatMapFunction;
+
+import scala.Tuple2;
+
+/**
+ * {@link SparkShuffler} that sorts the input by key and then, within each
+ * partition, collapses consecutive rows sharing a key into a single
+ * (key, values) pair.
+ */
+public class SortByShuffler implements SparkShuffler {
+
+  /**
+   * Sorts {@code input} by key and groups adjacent rows with equal keys.
+   *
+   * @param input key/value pairs to shuffle
+   * @return an RDD pairing each key with the values that followed one another
+   *         under that key in the sorted partition
+   */
+  @Override
+  public JavaPairRDD<BytesWritable, Iterable<BytesWritable>> shuffle(
+      JavaPairRDD<BytesWritable, BytesWritable> input) {
+    JavaPairRDD<BytesWritable, BytesWritable> rdd = input.sortByKey();
+    return rdd.mapPartitionsToPair(new ShuffleFunction());
+  }
+
+  /**
+   * Per-partition function that turns an iterator of (key, value) rows into an
+   * iterable of (key, values) groups. Rows are grouped only when they are
+   * adjacent, so the input is expected to already be ordered by key.
+   */
+  private static class ShuffleFunction implements
+      PairFlatMapFunction<Iterator<Tuple2<BytesWritable, BytesWritable>>,
+          BytesWritable, Iterable<BytesWritable>> {
+    // make eclipse happy
+    private static final long serialVersionUID = 1L;
+
+    /**
+     * @param it rows of one partition
+     * @return a single-pass iterable backed by {@code it}; every call to
+     *         {@code iterator()} returns the same underlying iterator
+     */
+    @Override
+    public Iterable<Tuple2<BytesWritable, Iterable<BytesWritable>>> call(
+        final Iterator<Tuple2<BytesWritable, BytesWritable>> it) throws Exception {
+      // Use input iterator to back returned iterable object.
+      final Iterator<Tuple2<BytesWritable, Iterable<BytesWritable>>> resultIt =
+          new Iterator<Tuple2<BytesWritable, Iterable<BytesWritable>>>() {
+        // Key of the group being accumulated; null means no row is pending.
+        // NOTE(review): assumes keys are never null -- confirm against callers.
+        private BytesWritable curKey = null;
+        // Values collected so far for curKey.
+        // TODO: this list needs to be improved to prevent excessive memory
+        // usage for large groups, but this can be done in a later phase.
+        private List<BytesWritable> curValues = new ArrayList<BytesWritable>();
+
+        @Override
+        public boolean hasNext() {
+          return it.hasNext() || curKey != null;
+        }
+
+        @Override
+        public Tuple2<BytesWritable, Iterable<BytesWritable>> next() {
+          // NOTE(review): key/value objects are stored by reference; this
+          // presumes the upstream iterator does not reuse Writable instances
+          // between rows -- verify.
+          while (it.hasNext()) {
+            Tuple2<BytesWritable, BytesWritable> row = it.next();
+            BytesWritable key = row._1();
+            if (curKey != null && !curKey.equals(key)) {
+              // Key changed: emit the finished group and start the next one
+              // with the row that was just read.
+              Tuple2<BytesWritable, Iterable<BytesWritable>> finished =
+                  new Tuple2<BytesWritable, Iterable<BytesWritable>>(curKey, curValues);
+              curKey = key;
+              curValues = new ArrayList<BytesWritable>();
+              curValues.add(row._2());
+              return finished;
+            }
+            curKey = key;
+            curValues.add(row._2());
+          }
+          if (curKey == null) {
+            throw new NoSuchElementException();
+          }
+          // Input exhausted: whatever has been accumulated is the last group.
+          Tuple2<BytesWritable, Iterable<BytesWritable>> last =
+              new Tuple2<BytesWritable, Iterable<BytesWritable>>(curKey, curValues);
+          curKey = null;
+          curValues = new ArrayList<BytesWritable>();
+          return last;
+        }
+
+        @Override
+        public void remove() {
+          throw new UnsupportedOperationException("remove is not supported");
+        }
+
+      };
+
+      return new Iterable<Tuple2<BytesWritable, Iterable<BytesWritable>>>() {
+        @Override
+        public Iterator<Tuple2<BytesWritable, Iterable<BytesWritable>>> iterator() {
+          return resultIt;
+        }
+      };
+    }
+  }
+
+}

Added: 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkShuffler.java
URL: 
http://svn.apache.org/viewvc/hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkShuffler.java?rev=1614850&view=auto
==============================================================================
--- 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkShuffler.java
 (added)
+++ 
hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark/SparkShuffler.java
 Thu Jul 31 10:38:20 2014
@@ -0,0 +1,29 @@
+/**
+ *  Licensed to the Apache Software Foundation (ASF) under one
+ *  or more contributor license agreements.  See the NOTICE file
+ *  distributed with this work for additional information
+ *  regarding copyright ownership.  The ASF licenses this file
+ *  to you under the Apache License, Version 2.0 (the
+ *  "License"); you may not use this file except in compliance
+ *  with the License.  You may obtain a copy of the License at
+ *
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+package org.apache.hadoop.hive.ql.exec.spark;
+
+import org.apache.hadoop.io.BytesWritable;
+import org.apache.spark.api.java.JavaPairRDD;
+
+/**
+ * Strategy for turning an RDD of (key, value) pairs into an RDD of
+ * (key, values) pairs.
+ */
+public interface SparkShuffler {
+
+  /**
+   * Shuffles the given pairs.
+   *
+   * @param input key/value pairs, both as {@code BytesWritable}
+   * @return an RDD that pairs each key with an {@code Iterable} of values;
+   *         how values are grouped and ordered is up to the implementation
+   */
+  JavaPairRDD<BytesWritable, Iterable<BytesWritable>> shuffle(
+      JavaPairRDD<BytesWritable, BytesWritable> input);
+
+}

svn commit: r1614850 - in /hive/branches/spark/ql/src/java/org/apache/hadoop/hive/ql/exec/spark: GroupByShuffler.java ShuffleTran.java SortByShuffler.java SparkShuffler.java

Reply via email to