[jira] [Updated] (SPARK-14065) serialize MapStatuses in serial model

2016-03-22 Thread xukun (JIRA)

 [ https://issues.apache.org/jira/browse/SPARK-14065?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]

xukun updated SPARK-14065:
--
Description: 
The problem reproduces with a query like this:
{code:sql|borderStyle=solid}
use tpcds_parquet_30720;
drop table temp;
create table temp as
select ws_order_number, sum1, sum2, sum1 * sum2
from (select ws_order_number, count(1) as sum1 from web_sales group by ws_order_number) x
join (select wr_order_number, count(1) as sum2 from web_returns group by wr_order_number) y
on x.ws_order_number = y.wr_order_number;
{code}

The executor log shows a GetMapOutputStatuses request failing with an RpcTimeoutException:
{code:borderStyle=solid}
2016-03-22 11:48:48,388 | WARN  | [Executor task launch worker-1] | Error sending message [message = GetMapOutputStatuses(1)] in 1 attempts | org.apache.spark.Logging$class.logWarning(Logging.scala:92)
org.apache.spark.rpc.RpcTimeoutException: Futures timed out after [120 seconds]. This timeout is controlled by spark.network.timeout
    at org.apache.spark.rpc.RpcTimeout.org$apache$spark$rpc$RpcTimeout$$createRpcTimeoutException(RpcTimeout.scala:48)
    at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:63)
    at org.apache.spark.rpc.RpcTimeout$$anonfun$addMessageIfTimeout$1.applyOrElse(RpcTimeout.scala:59)
    at scala.runtime.AbstractPartialFunction.apply(AbstractPartialFunction.scala:33)
    at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:76)
    at org.apache.spark.rpc.RpcEndpointRef.askWithRetry(RpcEndpointRef.scala:101)
    at org.apache.spark.rpc.RpcEndpointRef.askWithRetry(RpcEndpointRef.scala:77)
    at org.apache.spark.MapOutputTracker.askTracker(MapOutputTracker.scala:98)
    at org.apache.spark.MapOutputTracker.getMapSizesByExecutorId(MapOutputTracker.scala:156)
    at org.apache.spark.shuffle.hash.HashShuffleReader.read(HashShuffleReader.scala:47)
    at org.apache.spark.sql.execution.ShuffledRowRDD.compute(ShuffledRowRDD.scala:71)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:301)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:265)
    at org.apache.spark.rdd.MapPartitionsWithPreparationRDD.compute(MapPartitionsWithPreparationRDD.scala:63)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:301)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:265)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:301)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:265)
    at org.apache.spark.rdd.MapPartitionsWithPreparationRDD.compute(MapPartitionsWithPreparationRDD.scala:63)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:301)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:265)
    at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:99)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:301)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:265)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:301)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:265)
    at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
    at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:301)
    at org.apache.spark.rdd.RDD.iterator(RDD.scala:265)
    at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:68)
    at org.apache.spark.scheduler.Task.run(Task.scala:90)
    at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:229)
    at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
    at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
    at java.lang.Thread.run(Thread.java:745)
Caused by: java.util.concurrent.TimeoutException: Futures timed out after [120 seconds]
    at scala.concurrent.impl.Promise$DefaultPromise.ready(Promise.scala:219)
    at scala.concurrent.impl.Promise$DefaultPromise.result(Promise.scala:223)
    at scala.concurrent.Await$$anonfun$result$1.apply(package.scala:107)
    at scala.concurrent.BlockContext$DefaultBlockContext$.blockOn(BlockContext.scala:53)
    at scala.concurrent.Await$.result(package.scala:107)
    at org.apache.spark.rpc.RpcTimeout.awaitResult(RpcTimeout.scala:75)
    ... 32 more
2016-03-22 11:48:52,044 | INFO  | [Executor task launch worker-1] | Got the output locations | org.apache.spark.Logging$class.logInfo(Logging.scala:59)
{code}
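
For context, the call path in this trace is the executor-side MapOutputTracker asking the driver for the serialized map output statuses before it can read any shuffle data. Below is a minimal, self-contained sketch of that pattern; the message and method names follow the trace, but the types are stand-ins for illustration, not Spark source:
{code:scala|borderStyle=solid}
import scala.concurrent.{Await, Future}
import scala.concurrent.duration._

// Message the executor sends to the driver-side tracker endpoint
// (the same name that appears in the warning above).
case class GetMapOutputStatuses(shuffleId: Int)

// Stand-in for org.apache.spark.rpc.RpcEndpointRef.
trait TrackerRef {
  def ask(msg: Any): Future[Array[Byte]]
}

class MapOutputTrackerSketch(tracker: TrackerRef,
                             networkTimeout: FiniteDuration = 120.seconds) {
  // Mirrors askTracker / getMapSizesByExecutorId in the trace: the task
  // thread blocks here until the driver replies with the serialized
  // statuses, or the RPC timeout fires -- producing the
  // "Futures timed out after [120 seconds]" failure shown above.
  def getSerializedStatuses(shuffleId: Int): Array[Byte] =
    Await.result(tracker.ask(GetMapOutputStatuses(shuffleId)), networkTimeout)
}
{code}
Every reduce task in the stage goes through this blocking ask, so when the driver is slow to produce the reply, many tasks hit the 120-second wall together.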

The driver log shows that MapStatus serialization is serial and slow: successive MapOutputTrackerMaster entries for the same shuffle appear seven seconds apart on different dispatcher threads.
{code:borderStyle=solid}
Line 192882: 16/03/22 11:46:54 INFO [dispatcher-event-loop-25] MapOutputTrackerMaster: Size of output statuses for shuffle 1 is 10298063 bytes
Line 192914: 16/03/22 11:47:01 INFO [dispatcher-event-loop-7]
{code}
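
If producing one reply for the ~10 MB status array costs on the order of seconds, as the gap between log lines 192882 and 192914 suggests, and replies are produced one after another, then a few dozen concurrently asking executors push the last requester past the timeout (e.g. 30 requests x 7 s per pass is about 210 s, well over the 120 s limit). The sketch below contrasts the serial model with one possible cache-and-reuse remedy; it assumes the driver re-serializes the full array per request, which the repeated log entries suggest, and is illustrative rather than Spark source:
{code:scala|borderStyle=solid}
import java.io.{ByteArrayOutputStream, ObjectOutputStream}
import java.util.concurrent.ConcurrentHashMap

// Toy stand-in for org.apache.spark.scheduler.MapStatus.
final case class MapStatus(location: String, sizes: Array[Byte]) extends Serializable

object StatusReply {
  private def serialize(statuses: Array[MapStatus]): Array[Byte] = {
    val buf = new ByteArrayOutputStream()
    val out = new ObjectOutputStream(buf)
    out.writeObject(statuses)
    out.close()
    buf.toByteArray // ~10 MB for this job, per the log line above
  }

  // Serial model: every GetMapOutputStatuses reply pays the full
  // serialization cost again, one request after another.
  def replySerial(statuses: Array[MapStatus]): Array[Byte] =
    serialize(statuses)

  // One possible remedy: serialize once per shuffle and reuse the bytes,
  // so concurrent requests cost a cache lookup instead of another pass.
  private val cache = new ConcurrentHashMap[Int, Array[Byte]]()

  def replyCached(shuffleId: Int, statuses: Array[MapStatus]): Array[Byte] = {
    val hit = cache.get(shuffleId)
    if (hit != null) hit
    else {
      val bytes = serialize(statuses)
      cache.putIfAbsent(shuffleId, bytes)
      bytes
    }
  }
}
{code}
Moving the serialization off the dispatcher event loop onto a worker pool would also help, since a dispatcher thread blocked for a whole serialization pass cannot service any other RPC in the meantime.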
