flatMapGroups.

rxin Mon, 23 Nov 2015 22:22:53 -0800

Repository: spark
Updated Branches:
  refs/heads/branch-1.6 4904e10a2 -> 2561976dc



[SPARK-11933][SQL] Rename mapGroup -> mapGroups and flatMapGroup -> 
flatMapGroups.

Based on feedback from Matei, this is more consistent with mapPartitions in 
Spark.

Also addresses some of the cleanups from a previous commit that renames the 
type variables.

Author: Reynold Xin <[email protected]>

Closes #9919 from rxin/SPARK-11933.

(cherry picked from commit 8d57524662fad4a0760f3bc924e690c2a110e7f7)
Signed-off-by: Reynold Xin <[email protected]>


Project: http://git-wip-us.apache.org/repos/asf/spark/repo
Commit: http://git-wip-us.apache.org/repos/asf/spark/commit/2561976d
Tree: http://git-wip-us.apache.org/repos/asf/spark/tree/2561976d
Diff: http://git-wip-us.apache.org/repos/asf/spark/diff/2561976d

Branch: refs/heads/branch-1.6
Commit: 2561976dcc428d0f633b222700b0bfdd9c5c99a6
Parents: 4904e10
Author: Reynold Xin <[email protected]>
Authored: Mon Nov 23 22:22:15 2015 -0800
Committer: Reynold Xin <[email protected]>
Committed: Mon Nov 23 22:22:23 2015 -0800

----------------------------------------------------------------------
 .../api/java/function/FlatMapGroupFunction.java | 28 ---------------
 .../java/function/FlatMapGroupsFunction.java    | 28 +++++++++++++++
 .../api/java/function/MapGroupFunction.java     | 28 ---------------
 .../api/java/function/MapGroupsFunction.java    | 28 +++++++++++++++
 .../org/apache/spark/sql/GroupedDataset.scala   | 36 ++++++++++----------
 .../org/apache/spark/sql/JavaDatasetSuite.java  | 10 +++---
 .../spark/sql/DatasetPrimitiveSuite.scala       |  4 +--
 .../org/apache/spark/sql/DatasetSuite.scala     | 12 +++----
 8 files changed, 87 insertions(+), 87 deletions(-)
----------------------------------------------------------------------


http://git-wip-us.apache.org/repos/asf/spark/blob/2561976d/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupFunction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupFunction.java b/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupFunction.java
deleted file mode 100644
index 18a2d73..0000000
--- a/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupFunction.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.api.java.function;
-
-import java.io.Serializable;
-import java.util.Iterator;
-
-/**
- * A function that returns zero or more output records from each grouping key and its values.
- */
-public interface FlatMapGroupFunction<K, V, R> extends Serializable {
-  Iterable<R> call(K key, Iterator<V> values) throws Exception;
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/2561976d/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsFunction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsFunction.java b/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsFunction.java
new file mode 100644
index 0000000..d7a80e7
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/api/java/function/FlatMapGroupsFunction.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.java.function;
+
+import java.io.Serializable;
+import java.util.Iterator;
+
+/**
+ * A function that returns zero or more output records from each grouping key and its values.
+ */
+public interface FlatMapGroupsFunction<K, V, R> extends Serializable {
+  Iterable<R> call(K key, Iterator<V> values) throws Exception;
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/2561976d/core/src/main/java/org/apache/spark/api/java/function/MapGroupFunction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/spark/api/java/function/MapGroupFunction.java b/core/src/main/java/org/apache/spark/api/java/function/MapGroupFunction.java
deleted file mode 100644
index 4f3f222..0000000
--- a/core/src/main/java/org/apache/spark/api/java/function/MapGroupFunction.java
+++ /dev/null
@@ -1,28 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *    http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.spark.api.java.function;
-
-import java.io.Serializable;
-import java.util.Iterator;
-
-/**
- * Base interface for a map function used in GroupedDataset's mapGroup function.
- */
-public interface MapGroupFunction<K, V, R> extends Serializable {
-  R call(K key, Iterator<V> values) throws Exception;
-}

http://git-wip-us.apache.org/repos/asf/spark/blob/2561976d/core/src/main/java/org/apache/spark/api/java/function/MapGroupsFunction.java
----------------------------------------------------------------------
diff --git a/core/src/main/java/org/apache/spark/api/java/function/MapGroupsFunction.java b/core/src/main/java/org/apache/spark/api/java/function/MapGroupsFunction.java
new file mode 100644
index 0000000..faa59ea
--- /dev/null
+++ b/core/src/main/java/org/apache/spark/api/java/function/MapGroupsFunction.java
@@ -0,0 +1,28 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.api.java.function;
+
+import java.io.Serializable;
+import java.util.Iterator;
+
+/**
+ * Base interface for a map function used in GroupedDataset's mapGroup function.
+ */
+public interface MapGroupsFunction<K, V, R> extends Serializable {
+  R call(K key, Iterator<V> values) throws Exception;
+}

http://git-wip-us.apache.org/repos/asf/spark/blob/2561976d/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataset.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataset.scala b/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataset.scala
index 7f43ce1..793a86b 100644
--- a/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataset.scala
+++ b/sql/core/src/main/scala/org/apache/spark/sql/GroupedDataset.scala
@@ -43,7 +43,7 @@ import org.apache.spark.sql.expressions.Aggregator
 @Experimental
 class GroupedDataset[K, V] private[sql](
     kEncoder: Encoder[K],
-    tEncoder: Encoder[V],
+    vEncoder: Encoder[V],
     val queryExecution: QueryExecution,
     private val dataAttributes: Seq[Attribute],
     private val groupingAttributes: Seq[Attribute]) extends Serializable {
@@ -53,12 +53,12 @@ class GroupedDataset[K, V] private[sql](
   // queryexecution.
 
   private implicit val unresolvedKEncoder = encoderFor(kEncoder)
-  private implicit val unresolvedTEncoder = encoderFor(tEncoder)
+  private implicit val unresolvedVEncoder = encoderFor(vEncoder)
 
   private val resolvedKEncoder =
     unresolvedKEncoder.resolve(groupingAttributes, OuterScopes.outerScopes)
-  private val resolvedTEncoder =
-    unresolvedTEncoder.resolve(dataAttributes, OuterScopes.outerScopes)
+  private val resolvedVEncoder =
+    unresolvedVEncoder.resolve(dataAttributes, OuterScopes.outerScopes)
 
   private def logicalPlan = queryExecution.analyzed
   private def sqlContext = queryExecution.sqlContext
@@ -76,7 +76,7 @@ class GroupedDataset[K, V] private[sql](
   def keyAs[L : Encoder]: GroupedDataset[L, V] =
     new GroupedDataset(
       encoderFor[L],
-      unresolvedTEncoder,
+      unresolvedVEncoder,
       queryExecution,
       dataAttributes,
       groupingAttributes)
@@ -110,13 +110,13 @@ class GroupedDataset[K, V] private[sql](
    *
    * @since 1.6.0
    */
-  def flatMapGroup[U : Encoder](f: (K, Iterator[V]) => TraversableOnce[U]): Dataset[U] = {
+  def flatMapGroups[U : Encoder](f: (K, Iterator[V]) => TraversableOnce[U]): Dataset[U] = {
     new Dataset[U](
       sqlContext,
       MapGroups(
         f,
         resolvedKEncoder,
-        resolvedTEncoder,
+        resolvedVEncoder,
         groupingAttributes,
         logicalPlan))
   }
@@ -138,8 +138,8 @@ class GroupedDataset[K, V] private[sql](
    *
    * @since 1.6.0
    */
-  def flatMapGroup[U](f: FlatMapGroupFunction[K, V, U], encoder: Encoder[U]): Dataset[U] = {
-    flatMapGroup((key, data) => f.call(key, data.asJava).asScala)(encoder)
+  def flatMapGroups[U](f: FlatMapGroupsFunction[K, V, U], encoder: Encoder[U]): Dataset[U] = {
+    flatMapGroups((key, data) => f.call(key, data.asJava).asScala)(encoder)
   }
 
   /**
@@ -158,9 +158,9 @@ class GroupedDataset[K, V] private[sql](
    *
    * @since 1.6.0
    */
-  def mapGroup[U : Encoder](f: (K, Iterator[V]) => U): Dataset[U] = {
+  def mapGroups[U : Encoder](f: (K, Iterator[V]) => U): Dataset[U] = {
     val func = (key: K, it: Iterator[V]) => Iterator(f(key, it))
-    flatMapGroup(func)
+    flatMapGroups(func)
   }
 
   /**
@@ -179,8 +179,8 @@ class GroupedDataset[K, V] private[sql](
    *
    * @since 1.6.0
    */
-  def mapGroup[U](f: MapGroupFunction[K, V, U], encoder: Encoder[U]): Dataset[U] = {
-    mapGroup((key, data) => f.call(key, data.asJava))(encoder)
+  def mapGroups[U](f: MapGroupsFunction[K, V, U], encoder: Encoder[U]): Dataset[U] = {
+    mapGroups((key, data) => f.call(key, data.asJava))(encoder)
   }
 
   /**
@@ -192,8 +192,8 @@ class GroupedDataset[K, V] private[sql](
   def reduce(f: (V, V) => V): Dataset[(K, V)] = {
     val func = (key: K, it: Iterator[V]) => Iterator((key, it.reduce(f)))
 
-    implicit val resultEncoder = ExpressionEncoder.tuple(unresolvedKEncoder, unresolvedTEncoder)
-    flatMapGroup(func)
+    implicit val resultEncoder = ExpressionEncoder.tuple(unresolvedKEncoder, unresolvedVEncoder)
+    flatMapGroups(func)
   }
 
   /**
@@ -213,7 +213,7 @@ class GroupedDataset[K, V] private[sql](
 
   private def withEncoder(c: Column): Column = c match {
     case tc: TypedColumn[_, _] =>
-      tc.withInputType(resolvedTEncoder.bind(dataAttributes), dataAttributes)
+      tc.withInputType(resolvedVEncoder.bind(dataAttributes), dataAttributes)
     case _ => c
   }
 
@@ -227,7 +227,7 @@ class GroupedDataset[K, V] private[sql](
     val encoders = columns.map(_.encoder)
     val namedColumns =
       columns.map(
-        _.withInputType(resolvedTEncoder, dataAttributes).named)
+        _.withInputType(resolvedVEncoder, dataAttributes).named)
     val keyColumn = if (groupingAttributes.length > 1) {
       Alias(CreateStruct(groupingAttributes), "key")()
     } else {
@@ -304,7 +304,7 @@ class GroupedDataset[K, V] private[sql](
   def cogroup[U, R : Encoder](
       other: GroupedDataset[K, U])(
       f: (K, Iterator[V], Iterator[U]) => TraversableOnce[R]): Dataset[R] = {
-    implicit def uEnc: Encoder[U] = other.unresolvedTEncoder
+    implicit def uEnc: Encoder[U] = other.unresolvedVEncoder
     new Dataset[R](
       sqlContext,
       CoGroup(

http://git-wip-us.apache.org/repos/asf/spark/blob/2561976d/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java
----------------------------------------------------------------------
diff --git a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java
index cf335ef..67a3190 100644
--- a/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java
+++ b/sql/core/src/test/java/test/org/apache/spark/sql/JavaDatasetSuite.java
@@ -170,7 +170,7 @@ public class JavaDatasetSuite implements Serializable {
       }
     }, Encoders.INT());
 
-    Dataset<String> mapped = grouped.mapGroup(new MapGroupFunction<Integer, String, String>() {
+    Dataset<String> mapped = grouped.mapGroups(new MapGroupsFunction<Integer, String, String>() {
       @Override
       public String call(Integer key, Iterator<String> values) throws Exception {
         StringBuilder sb = new StringBuilder(key.toString());
@@ -183,8 +183,8 @@ public class JavaDatasetSuite implements Serializable {
 
     Assert.assertEquals(Arrays.asList("1a", "3foobar"), mapped.collectAsList());
 
-    Dataset<String> flatMapped = grouped.flatMapGroup(
-      new FlatMapGroupFunction<Integer, String, String>() {
+    Dataset<String> flatMapped = grouped.flatMapGroups(
+      new FlatMapGroupsFunction<Integer, String, String>() {
         @Override
         public Iterable<String> call(Integer key, Iterator<String> values) throws Exception {
           StringBuilder sb = new StringBuilder(key.toString());
@@ -249,8 +249,8 @@ public class JavaDatasetSuite implements Serializable {
     GroupedDataset<Integer, String> grouped =
       ds.groupBy(length(col("value"))).keyAs(Encoders.INT());
 
-    Dataset<String> mapped = grouped.mapGroup(
-      new MapGroupFunction<Integer, String, String>() {
+    Dataset<String> mapped = grouped.mapGroups(
+      new MapGroupsFunction<Integer, String, String>() {
         @Override
         public String call(Integer key, Iterator<String> data) throws Exception {
           StringBuilder sb = new StringBuilder(key.toString());

http://git-wip-us.apache.org/repos/asf/spark/blob/2561976d/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala
index d387710..f75d096 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetPrimitiveSuite.scala
@@ -86,7 +86,7 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSQLContext {
   test("groupBy function, map") {
     val ds = Seq(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11).toDS()
     val grouped = ds.groupBy(_ % 2)
-    val agged = grouped.mapGroup { case (g, iter) =>
+    val agged = grouped.mapGroups { case (g, iter) =>
       val name = if (g == 0) "even" else "odd"
       (name, iter.size)
     }
@@ -99,7 +99,7 @@ class DatasetPrimitiveSuite extends QueryTest with SharedSQLContext {
   test("groupBy function, flatMap") {
     val ds = Seq("a", "b", "c", "xyz", "hello").toDS()
     val grouped = ds.groupBy(_.length)
-    val agged = grouped.flatMapGroup { case (g, iter) => Iterator(g.toString, iter.mkString) }
+    val agged = grouped.flatMapGroups { case (g, iter) => Iterator(g.toString, iter.mkString) }
 
     checkAnswer(
       agged,

http://git-wip-us.apache.org/repos/asf/spark/blob/2561976d/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
----------------------------------------------------------------------
diff --git a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
index cc8e432..dbdd7ba 100644
--- a/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
+++ b/sql/core/src/test/scala/org/apache/spark/sql/DatasetSuite.scala
@@ -224,7 +224,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   test("groupBy function, map") {
     val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS()
     val grouped = ds.groupBy(v => (v._1, "word"))
-    val agged = grouped.mapGroup { case (g, iter) => (g._1, iter.map(_._2).sum) }
+    val agged = grouped.mapGroups { case (g, iter) => (g._1, iter.map(_._2).sum) }
 
     checkAnswer(
       agged,
@@ -234,7 +234,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   test("groupBy function, flatMap") {
     val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS()
     val grouped = ds.groupBy(v => (v._1, "word"))
-    val agged = grouped.flatMapGroup { case (g, iter) =>
+    val agged = grouped.flatMapGroups { case (g, iter) =>
       Iterator(g._1, iter.map(_._2).sum.toString)
     }
 
@@ -255,7 +255,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   test("groupBy columns, map") {
     val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS()
     val grouped = ds.groupBy($"_1")
-    val agged = grouped.mapGroup { case (g, iter) => (g.getString(0), iter.map(_._2).sum) }
+    val agged = grouped.mapGroups { case (g, iter) => (g.getString(0), iter.map(_._2).sum) }
 
     checkAnswer(
       agged,
@@ -265,7 +265,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   test("groupBy columns asKey, map") {
     val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS()
     val grouped = ds.groupBy($"_1").keyAs[String]
-    val agged = grouped.mapGroup { case (g, iter) => (g, iter.map(_._2).sum) }
+    val agged = grouped.mapGroups { case (g, iter) => (g, iter.map(_._2).sum) }
 
     checkAnswer(
       agged,
@@ -275,7 +275,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   test("groupBy columns asKey tuple, map") {
     val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS()
     val grouped = ds.groupBy($"_1", lit(1)).keyAs[(String, Int)]
-    val agged = grouped.mapGroup { case (g, iter) => (g, iter.map(_._2).sum) }
+    val agged = grouped.mapGroups { case (g, iter) => (g, iter.map(_._2).sum) }
 
     checkAnswer(
       agged,
@@ -285,7 +285,7 @@ class DatasetSuite extends QueryTest with SharedSQLContext {
   test("groupBy columns asKey class, map") {
     val ds = Seq(("a", 10), ("a", 20), ("b", 1), ("b", 2), ("c", 1)).toDS()
     val grouped = ds.groupBy($"_1".as("a"), lit(1).as("b")).keyAs[ClassData]
-    val agged = grouped.mapGroup { case (g, iter) => (g, iter.map(_._2).sum) }
+    val agged = grouped.mapGroups { case (g, iter) => (g, iter.map(_._2).sum) }
 
     checkAnswer(
       agged,


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

spark git commit: [SPARK-11933][SQL] Rename mapGroup -> mapGroups and flatMapGroup -> flatMapGroups.

Reply via email to