Paul Wu created SPARK-9255: ------------------------------ Summary: Timestamp handling incorrect for Spark 1.4.1 on Linux Key: SPARK-9255 URL: https://issues.apache.org/jira/browse/SPARK-9255 Project: Spark Issue Type: Bug Components: SQL Affects Versions: 1.4.1 Environment: Redhat Linux, Java 8.0 and Spark 1.4.1 release. Reporter: Paul Wu
This is a very strange case involving timestamp I can run the program on Windows using dev pom.xml (1.4.1) or 1.3.1 release downloaded from Apache without issues , but when I ran it on Spark 1.4.1 release either downloaded from Apache or the version built with scala 2.11 on redhat linux, it has the following error (the code I used is after this stack trace): 15/07/22 12:02:50 ERROR Executor 96: Exception in task 0.0 in stage 0.0 (TID 0) java.util.concurrent.ExecutionException: scala.tools.reflect.ToolBoxError: reflective compilation has failed: value < is not a member of TimestampType.this.InternalType at org.spark-project.guava.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:306) at org.spark-project.guava.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:293) at org.spark-project.guava.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) at org.spark-project.guava.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135) at org.spark-project.guava.cache.LocalCache$Segment.getAndRecordStats(LocalCache.java:2410) at org.spark-project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2380) at org.spark-project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342) at org.spark-project.guava.cache.LocalCache$Segment.get(LocalCache.java:2257) at org.spark-project.guava.cache.LocalCache.get(LocalCache.java:4000) at org.spark-project.guava.cache.LocalCache.getOrLoad(LocalCache.java:4004) at org.spark-project.guava.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4874) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:105) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:102) at org.apache.spark.sql.execution.SparkPlan.newMutableProjection(SparkPlan.scala:170) at org.apache.spark.sql.execution.GeneratedAggregate$$anonfun$9.apply(GeneratedAggregate.scala:261) at org.apache.spark.sql.execution.GeneratedAggregate$$anonfun$9.apply(GeneratedAggregate.scala:246) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:70) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) at org.apache.spark.scheduler.Task.run(Task.scala:70) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Caused by: scala.tools.reflect.ToolBoxError: reflective compilation has failed: value < is not a member of TimestampType.this.InternalType at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$ToolBoxGlobal.throwIfErrors(ToolBoxFactory.scala:316) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$ToolBoxGlobal.wrapInPackageAndCompile(ToolBoxFactory.scala:198) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$ToolBoxGlobal.compile(ToolBoxFactory.scala:252) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$$anonfun$compile$2.apply(ToolBoxFactory.scala:429) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$$anonfun$compile$2.apply(ToolBoxFactory.scala:422) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$withCompilerApi$.liftedTree2$1(ToolBoxFactory.scala:355) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$withCompilerApi$.apply(ToolBoxFactory.scala:355) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl.compile(ToolBoxFactory.scala:422) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl.eval(ToolBoxFactory.scala:444) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection$.create(GenerateMutableProjection.scala:74) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection$.create(GenerateMutableProjection.scala:26) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:92) at org.spark-project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599) at org.spark-project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379) ... 25 more 15/07/22 12:02:50 ERROR TaskSetManager 75: Task 0 in stage 0.0 failed 1 times; aborting job Exception in thread "main" org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.util.concurrent.ExecutionException: scala.tools.reflect.ToolBoxError: reflective compilation has failed: value < is not a member of TimestampType.this.InternalType at org.spark-project.guava.util.concurrent.AbstractFuture$Sync.getValue(AbstractFuture.java:306) at org.spark-project.guava.util.concurrent.AbstractFuture$Sync.get(AbstractFuture.java:293) at org.spark-project.guava.util.concurrent.AbstractFuture.get(AbstractFuture.java:116) at org.spark-project.guava.util.concurrent.Uninterruptibles.getUninterruptibly(Uninterruptibles.java:135) at org.spark-project.guava.cache.LocalCache$Segment.getAndRecordStats(LocalCache.java:2410) at org.spark-project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2380) at org.spark-project.guava.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342) at org.spark-project.guava.cache.LocalCache$Segment.get(LocalCache.java:2257) at org.spark-project.guava.cache.LocalCache.get(LocalCache.java:4000) at org.spark-project.guava.cache.LocalCache.getOrLoad(LocalCache.java:4004) at org.spark-project.guava.cache.LocalCache$LocalLoadingCache.get(LocalCache.java:4874) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:105) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator.generate(CodeGenerator.scala:102) at org.apache.spark.sql.execution.SparkPlan.newMutableProjection(SparkPlan.scala:170) at org.apache.spark.sql.execution.GeneratedAggregate$$anonfun$9.apply(GeneratedAggregate.scala:261) at org.apache.spark.sql.execution.GeneratedAggregate$$anonfun$9.apply(GeneratedAggregate.scala:246) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686) at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35) at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277) at org.apache.spark.rdd.RDD.iterator(RDD.scala:244) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:70) at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41) at org.apache.spark.scheduler.Task.run(Task.scala:70) at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213) at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142) at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617) at java.lang.Thread.run(Thread.java:745) Caused by: scala.tools.reflect.ToolBoxError: reflective compilation has failed: value < is not a member of TimestampType.this.InternalType at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$ToolBoxGlobal.throwIfErrors(ToolBoxFactory.scala:316) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$ToolBoxGlobal.wrapInPackageAndCompile(ToolBoxFactory.scala:198) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$ToolBoxGlobal.compile(ToolBoxFactory.scala:252) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$$anonfun$compile$2.apply(ToolBoxFactory.scala:429) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$$anonfun$compile$2.apply(ToolBoxFactory.scala:422) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$withCompilerApi$.liftedTree2$1(ToolBoxFactory.scala:355) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl$withCompilerApi$.apply(ToolBoxFactory.scala:355) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl.compile(ToolBoxFactory.scala:422) at scala.tools.reflect.ToolBoxFactory$ToolBoxImpl.eval(ToolBoxFactory.scala:444) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection$.create(GenerateMutableProjection.scala:74) at org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection$.create(GenerateMutableProjection.scala:26) at org.apache.spark.sql.catalyst.expressions.codegen.CodeGenerator$$anon$1.load(CodeGenerator.scala:92) at org.spark-project.guava.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599) at org.spark-project.guava.cache.LocalCache$Segment.loadSync(LocalCache.java:2379) ... 25 more Driver stacktrace: at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1273) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1264) at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1263) at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59) at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:48) at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1263) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730) at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:730) at scala.Option.foreach(Option.scala:257) at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:730) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1457) at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1418) at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48) ================================================ Code: TruncHour.java /* * To change this license header, choose License Headers in Project Properties. * To change this template file, choose Tools | Templates * and open the template in the editor. */ package zwu.spark.sample; import java.sql.Timestamp; import org.apache.spark.sql.api.java.UDF1; public class TruncHour implements UDF1<Timestamp, Timestamp> { @Override public Timestamp call(Timestamp t1) throws Exception { t1.setMinutes(0); t1.setSeconds(0); t1.setNanos(0); return t1; //throw new UnsupportedOperationException("Not supported yet."); //To change body of generated methods, choose Tools | Templates. } } TimestampSample.java: package zwu.spark.sample; import java.io.Serializable; import java.sql.Timestamp; import java.util.ArrayList; import java.util.Date; import java.util.List; import org.apache.log4j.Logger; import org.apache.spark.SparkConf; import org.apache.spark.api.java.JavaRDD; import org.apache.spark.api.java.JavaSparkContext; import org.apache.spark.sql.DataFrame; import org.apache.spark.sql.types.DataTypes; /** * * @author zw251y */ public class TimestampSample { private final static Logger log = Logger.getLogger(TimestampSample.class); public static void main(String[] args) { SparkConf sparkConf = new SparkConf().setAppName("TimeSample"); //if (PUtil.isWindows) { sparkConf.setMaster("local[1]"); //} JavaSparkContext sc = new JavaSparkContext(sparkConf); org.apache.spark.sql.hive.HiveContext sqlContext = new org.apache.spark.sql.hive.HiveContext(sc.sc()); List<TimeBean> tbList = new ArrayList<>(); TimeBean tb = new TimeBean(); tb.ts = new Timestamp(new Date().getTime()); tbList.add(tb); JavaRDD<TimeBean> mytable = sc.parallelize(tbList); //Apply a schema to an RDD of JavaBeans and register it as a table. //DataFrame schemaPeople = sqlContext.applySchema(people, TimeBean.class); DataFrame schemaPeople = sqlContext.createDataFrame(mytable, TimeBean.class); schemaPeople.registerTempTable("timetable"); sqlContext.udf().register("truncHour", new TruncHour(), DataTypes.TimestampType); // SQL can be run over RDDs that have been registered as tables. //org.apache.spark.sql.types.TimestampType p = null; //p. String test = "select p from (SELECT min(ts) p , count(ts) from timetable group by truncHour(ts)) as mytable group by p "; DataFrame df = sqlContext.sql(test); df.show(); } public static class TimeBean implements Serializable { private String name; private Timestamp ts; /** * Get the value of ts * * @return the value of ts */ public Timestamp getTs() { return ts; } /** * Set the value of ts * * @param d new value of ts */ public void setTs(Timestamp d) { this.ts = d; } public String getName() { return name; } public void setName(String name) { this.name = name; } } } -- This message was sent by Atlassian JIRA (v6.3.4#6332) --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org For additional commands, e-mail: issues-h...@spark.apache.org