Mateusz Michalowski created SPARK-9135:
------------------------------------------
             Summary: Filter fails when filtering with a method reference to an overridden method
                 Key: SPARK-9135
                 URL: https://issues.apache.org/jira/browse/SPARK-9135
             Project: Spark
          Issue Type: Bug
          Components: Java API
    Affects Versions: 1.4.0
            Reporter: Mateusz Michalowski

Filtering fails when the predicate is a method reference to an overridden method. In the example below we filter by {{Fruit::isRed}}, which is overridden by {{Apple::isRed}} and {{Banana::isRed}}.

If we call {{apples.filter(Fruit::isRed)}} and then {{bananas.filter(Fruit::isRed)}} (or {{fruit.filter(Fruit::isRed)}}), Spark ends up applying the predicate instantiated for {{Apple}} to {{Banana}} elements, and throws a {{ClassCastException}} as a result.

No exception is thrown if we use a lambda instead of a method reference:

{code}
.filter(f -> f.isRed())
{code}

The following test reproduces the problem (a standalone sketch of the suspected cause follows the stack trace below):

{code:java}
package com.doggybites;

import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.Serializable;
import java.util.Arrays;

import static org.hamcrest.CoreMatchers.equalTo;
import static org.junit.Assert.assertThat;

public class SparkTest {

    static abstract class Fruit implements Serializable {
        abstract boolean isRed();
    }

    static class Banana extends Fruit {
        @Override
        boolean isRed() {
            return false;
        }
    }

    static class Apple extends Fruit {
        @Override
        boolean isRed() {
            return true;
        }
    }

    private JavaSparkContext sparkContext;

    @Before
    public void setUp() throws Exception {
        SparkConf sparkConf = new SparkConf().setAppName("test").setMaster("local[2]");
        sparkContext = new JavaSparkContext(sparkConf);
    }

    @After
    public void tearDown() throws Exception {
        sparkContext.stop();
    }

    private <T> JavaRDD<T> toRdd(T... array) {
        return sparkContext.parallelize(Arrays.asList(array));
    }

    @Test
    public void filters_apples_and_bananas_with_method_reference() {
        JavaRDD<Apple> appleRdd = toRdd(new Apple());
        JavaRDD<Banana> bananaRdd = toRdd(new Banana());

        // Both filters use the same method reference, Fruit::isRed.
        long redAppleCount = appleRdd.filter(Fruit::isRed).count();
        long redBananaCount = bananaRdd.filter(Fruit::isRed).count();

        assertThat(redAppleCount, equalTo(1L));
        assertThat(redBananaCount, equalTo(0L));
    }
}
{code}

The test above throws:

{code}
15/07/17 14:10:04 ERROR Executor: Exception in task 1.0 in stage 1.0 (TID 3)
java.lang.ClassCastException: com.doggybites.SparkTest$Banana cannot be cast to com.doggybites.SparkTest$Apple
	at com.doggybites.SparkTest$$Lambda$2/976119300.call(Unknown Source)
	at org.apache.spark.api.java.JavaRDD$$anonfun$filter$1.apply(JavaRDD.scala:78)
	at org.apache.spark.api.java.JavaRDD$$anonfun$filter$1.apply(JavaRDD.scala:78)
	at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:390)
	at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1626)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1099)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1099)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
	at org.apache.spark.scheduler.Task.run(Task.scala:70)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)

15/07/17 14:10:04 WARN TaskSetManager: Lost task 1.0 in stage 1.0 (TID 3, localhost): java.lang.ClassCastException: com.doggybites.SparkTest$Banana cannot be cast to com.doggybites.SparkTest$Apple
	at com.doggybites.SparkTest$$Lambda$2/976119300.call(Unknown Source)
	at org.apache.spark.api.java.JavaRDD$$anonfun$filter$1.apply(JavaRDD.scala:78)
	at org.apache.spark.api.java.JavaRDD$$anonfun$filter$1.apply(JavaRDD.scala:78)
	at scala.collection.Iterator$$anon$14.hasNext(Iterator.scala:390)
	at org.apache.spark.util.Utils$.getIteratorSize(Utils.scala:1626)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1099)
	at org.apache.spark.rdd.RDD$$anonfun$count$1.apply(RDD.scala:1099)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
	at org.apache.spark.SparkContext$$anonfun$runJob$5.apply(SparkContext.scala:1767)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:63)
	at org.apache.spark.scheduler.Task.run(Task.scala:70)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
{code}
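The failure can be reproduced without Spark, which suggests the cause lies in how serializable method references are deserialized rather than in Spark itself. Below is a minimal sketch of that hypothesis using plain Java serialization; the {{SerFun}} interface and the {{roundTrip}} helper are hypothetical stand-ins introduced for this sketch (Spark would use its own {{Function}} interface and closure serializer). Both call sites of {{Fruit::isRed}} serialize with identical implementation-method metadata, so the {{$deserializeLambda$}} method that javac generates may resolve both to the first matching call site, recreating a predicate whose body casts its argument to {{Apple}}:

{code:java}
package com.doggybites;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.io.Serializable;

public class LambdaSerializationDemo {

    // Hypothetical stand-in for Spark's serializable Function interface.
    interface SerFun<T> extends Serializable {
        boolean call(T t);
    }

    static abstract class Fruit implements Serializable {
        abstract boolean isRed();
    }

    static class Apple extends Fruit {
        @Override
        boolean isRed() { return true; }
    }

    static class Banana extends Fruit {
        @Override
        boolean isRed() { return false; }
    }

    // Serializes and deserializes an object, standing in for what Spark
    // does when it ships a closure to an executor.
    @SuppressWarnings("unchecked")
    static <T> T roundTrip(T obj) throws Exception {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(obj);
        }
        try (ObjectInputStream in =
                 new ObjectInputStream(new ByteArrayInputStream(bytes.toByteArray()))) {
            return (T) in.readObject();
        }
    }

    public static void main(String[] args) throws Exception {
        // Two call sites for the same method reference, instantiated at
        // different element types.
        SerFun<Apple> forApples = Fruit::isRed;
        SerFun<Banana> forBananas = Fruit::isRed;

        // Both serialized lambdas report Fruit.isRed as their implementation
        // method, so $deserializeLambda$ cannot tell them apart and may
        // rebuild both from the Apple call site, whose call(...) body casts
        // its argument to Apple before invoking isRed().
        System.out.println(roundTrip(forApples).call(new Apple()));
        System.out.println(roundTrip(forBananas).call(new Banana())); // expected to throw ClassCastException
    }
}
{code}

If this is the mechanism, it also explains why the explicit lambda {{f -> f.isRed()}} works: javac compiles each lambda to its own synthetic implementation method, so the deserialization metadata is unambiguous per call site.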