[ 
https://issues.apache.org/jira/browse/SPARK-19273?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
 ]

xukun updated SPARK-19273:
--------------------------
    Affects Version/s: 2.1.0
          Description: 
Execute "insert into table select * from a join b on a.xx = b.xx". In Shuffle 
stage ,we delete shuffle file, shuffle stage will not retry and job fail 
because task fail 4 times.
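
(The "4 times" above is Spark's default spark.task.maxFailures. A minimal Scala 
sketch, only to make that limit explicit; nothing in the repro below changes it:)
```scala
import org.apache.spark.SparkConf

// spark.task.maxFailures defaults to 4: once the same task has failed 4 times,
// the TaskSetManager aborts the stage and the job, instead of re-running the
// map stage that produced the lost shuffle output.
val conf = new SparkConf()
  .setAppName("spark-19273-repro")
  .set("spark.task.maxFailures", "4") // shown only to document the default
```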

Details:
Create two external tables using the TPC-DS text data:
create external table date_dim
(
    d_date_sk                 int,
    d_date_id                 string,
    d_date                    string,
    d_month_seq               int,
    d_week_seq                int,
    d_quarter_seq             int,
    d_year                    int,
    d_dow                     int,
    d_moy                     int,
    d_dom                     int,
    d_qoy                     int,
    d_fy_year                 int,
    d_fy_quarter_seq          int,
    d_fy_week_seq             int,
    d_day_name                string,
    d_quarter_name            string,
    d_holiday                 string,
    d_weekend                 string,
    d_following_holiday       string,
    d_first_dom               int,
    d_last_dom                int,
    d_same_day_ly             int,
    d_same_day_lq             int,
    d_current_day             string,
    d_current_week            string,
    d_current_month           string,
    d_current_quarter         string,
    d_current_year            string
)
row format delimited fields terminated by '|'
location 'path1';
create external table web_sales
(
    ws_sold_date_sk           int,
    ws_sold_time_sk           int,
    ws_ship_date_sk           int,
    ws_item_sk                int,
    ws_bill_customer_sk       int,
    ws_bill_cdemo_sk          int,
    ws_bill_hdemo_sk          int,
    ws_bill_addr_sk           int,
    ws_ship_customer_sk       int,
    ws_ship_cdemo_sk          int,
    ws_ship_hdemo_sk          int,
    ws_ship_addr_sk           int,
    ws_web_page_sk            int,
    ws_web_site_sk            int,
    ws_ship_mode_sk           int,
    ws_warehouse_sk           int,
    ws_promo_sk               int,
    ws_order_number           int,
    ws_quantity               int,
    ws_wholesale_cost         float,
    ws_list_price             float,
    ws_sales_price            float,
    ws_ext_discount_amt       float,
    ws_ext_sales_price        float,
    ws_ext_wholesale_cost     float,
    ws_ext_list_price         float,
    ws_ext_tax                float,
    ws_coupon_amt             float,
    ws_ext_ship_cost          float,
    ws_net_paid               float,
    ws_net_paid_inc_tax       float,
    ws_net_paid_inc_ship      float,
    ws_net_paid_inc_ship_tax  float,
    ws_net_profit             float
)
row format delimited fields terminated by '|'
location 'path2';

Then execute SQL like this:
create table web_sales1
(
    ws_sold_date_sk           int,
    ws_sold_time_sk           int,
    ws_ship_date_sk           int,
    ws_item_sk                int,
    ws_bill_customer_sk       int,
    ws_bill_cdemo_sk          int,
    ws_bill_hdemo_sk          int,
    ws_bill_addr_sk           int,
    ws_ship_customer_sk       int,
    ws_ship_cdemo_sk          int,
    ws_ship_hdemo_sk          int,
    ws_ship_addr_sk           int,
    ws_web_page_sk            int,
    ws_web_site_sk            int,
    ws_ship_mode_sk           int,
    ws_warehouse_sk           int,
    ws_promo_sk               int,
    ws_order_number           int,
    ws_quantity               int,
    ws_wholesale_cost         float,
    ws_list_price             float,
    ws_sales_price            float,
    ws_ext_discount_amt       float,
    ws_ext_sales_price        float,
    ws_ext_wholesale_cost     float,
    ws_ext_list_price         float,
    ws_ext_tax                float,
    ws_coupon_amt             float,
    ws_ext_ship_cost          float,
    ws_net_paid               float,
    ws_net_paid_inc_tax       float,
    ws_net_paid_inc_ship      float,
    ws_net_paid_inc_ship_tax  float,
    ws_net_profit             float
)
partitioned by (ws_sold_date string)
stored as parquet;

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=100;
set spark.sql.autoBroadcastJoinThreshold = 1;
insert overwrite table web_sales1 partition (ws_sold_date)
select
        ws.ws_sold_date_sk,
        ws.ws_sold_time_sk,
        ws.ws_ship_date_sk,
        ws.ws_item_sk,
        ws.ws_bill_customer_sk,
        ws.ws_bill_cdemo_sk,
        ws.ws_bill_hdemo_sk,
        ws.ws_bill_addr_sk,
        ws.ws_ship_customer_sk,
        ws.ws_ship_cdemo_sk,
        ws.ws_ship_hdemo_sk,
        ws.ws_ship_addr_sk,
        ws.ws_web_page_sk,
        ws.ws_web_site_sk,
        ws.ws_ship_mode_sk,
        ws.ws_warehouse_sk,
        ws.ws_promo_sk,
        ws.ws_order_number,
        ws.ws_quantity,
        ws.ws_wholesale_cost,
        ws.ws_list_price,
        ws.ws_sales_price,
        ws.ws_ext_discount_amt,
        ws.ws_ext_sales_price,
        ws.ws_ext_wholesale_cost,
        ws.ws_ext_list_price,
        ws.ws_ext_tax,
        ws.ws_coupon_amt,
        ws.ws_ext_ship_cost,
        ws.ws_net_paid,
        ws.ws_net_paid_inc_tax,
        ws.ws_net_paid_inc_ship,
        ws.ws_net_paid_inc_ship_tax,
        ws.ws_net_profit,
        dd.d_date as ws_sold_date
      from tpcds_text.web_sales ws
      join tpcds_text.date_dim dd
      on (ws.ws_sold_date_sk = dd.d_date_sk);
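
(For reference, a rough DataFrame-API sketch of the same join; it is not part of 
the original repro. Setting spark.sql.autoBroadcastJoinThreshold to 1 is what 
forces a shuffle-based sort-merge join instead of a broadcast join, so there are 
shuffle files on the executors to delete:)
```scala
import org.apache.spark.sql.SparkSession

// Sketch only: assumes tpcds_text.web_sales and tpcds_text.date_dim exist in the
// Hive metastore, as created above.
val spark = SparkSession.builder().enableHiveSupport().getOrCreate()

// A 1-byte threshold disables broadcast joins, so the join below is planned as a
// shuffle (sort-merge) join and writes shuffle files during its map stage.
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "1")

val ws = spark.table("tpcds_text.web_sales")
val dd = spark.table("tpcds_text.date_dim")
val joined = ws.join(dd, ws("ws_sold_date_sk") === dd("d_date_sk"))
joined.count() // any action triggers the shuffle; the INSERT above does the same
```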
After the map stage, delete the executor's shuffle file; the job will fail. The log is:
```
17/01/18 16:28:49 INFO TaskSetManager: Starting task 37.1 in stage 8.0 (TID 52, 
xk3, executor 6, partition 37, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 38.0 in stage 8.0 (TID 49) on 
xk3, executor 6: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 1]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 38.1 in stage 8.0 (TID 53, 
xk3, executor 2, partition 38, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 39.0 in stage 8.0 (TID 50) on 
xk3, executor 2: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 2]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 39.1 in stage 8.0 (TID 54, 
xk3, executor 3, partition 39, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 40.0 in stage 8.0 (TID 51) on 
xk3, executor 3: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 3]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 40.1 in stage 8.0 (TID 55, 
xk3, executor 6, partition 40, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 37.1 in stage 8.0 (TID 52) on 
xk3, executor 6: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 4]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 37.2 in stage 8.0 (TID 56, 
xk2, executor 4, partition 37, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Finished task 34.0 in stage 8.0 (TID 45) 
in 1209 ms on xk2 (executor 4) (35/200)
17/01/18 16:28:49 INFO TaskSetManager: Starting task 41.0 in stage 8.0 (TID 57, 
xk3, executor 3, partition 41, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 39.1 in stage 8.0 (TID 54) on 
xk3, executor 3: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 5]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 39.2 in stage 8.0 (TID 58, 
xk3, executor 2, partition 39, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 38.1 in stage 8.0 (TID 53) on 
xk3, executor 2: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 6]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 38.2 in stage 8.0 (TID 59, 
xk2, executor 1, partition 38, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Finished task 30.0 in stage 8.0 (TID 41) 
in 2224 ms on xk2 (executor 1) (36/200)
17/01/18 16:28:49 INFO TaskSetManager: Starting task 42.0 in stage 8.0 (TID 60, 
xk2, executor 4, partition 42, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 37.2 in stage 8.0 (TID 56) on 
xk2, executor 4: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 7]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 37.3 in stage 8.0 (TID 61, 
xk3, executor 3, partition 37, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 41.0 in stage 8.0 (TID 57) on 
xk3, executor 3: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 8]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 41.1 in stage 8.0 (TID 62, 
xk3, executor 6, partition 41, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 40.1 in stage 8.0 (TID 55) on 
xk3, executor 6: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 9]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 40.2 in stage 8.0 (TID 63, 
xk2, executor 4, partition 40, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 42.0 in stage 8.0 (TID 60) on 
xk2, executor 4: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 10]
17/01/18 16:28:50 INFO TaskSetManager: Starting task 42.1 in stage 8.0 (TID 64, 
xk2, executor 1, partition 42, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:50 INFO TaskSetManager: Lost task 38.2 in stage 8.0 (TID 59) on 
xk2, executor 1: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 11]
17/01/18 16:28:50 INFO TaskSetManager: Starting task 38.3 in stage 8.0 (TID 65, 
xk3, executor 6, partition 38, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:50 INFO TaskSetManager: Lost task 41.1 in stage 8.0 (TID 62) on 
xk3, executor 6: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 12]
17/01/18 16:28:50 INFO TaskSetManager: Starting task 41.2 in stage 8.0 (TID 66, 
xk3, executor 2, partition 41, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:50 INFO TaskSetManager: Lost task 39.2 in stage 8.0 (TID 58) on 
xk3, executor 2: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 13]
17/01/18 16:28:50 INFO TaskSetManager: Starting task 39.3 in stage 8.0 (TID 67, 
xk3, executor 3, partition 39, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:50 INFO TaskSetManager: Lost task 37.3 in stage 8.0 (TID 61) on 
xk3, executor 3: org.apache.spark.SparkException (Task failed while writing 
rows.) [duplicate 14]
17/01/18 16:28:50 ERROR TaskSetManager: Task 37 in stage 8.0 failed 4 times; 
aborting job
17/01/18 16:28:50 INFO YarnScheduler: Cancelling stage 8
17/01/18 16:28:50 INFO YarnScheduler: Stage 8 was cancelled
17/01/18 16:28:50 INFO DAGScheduler: ResultStage 8 (processCmd at 
CliDriver.java:377) failed in 13.292 s due to Job aborted due to stage failure: 
Task 37 in stage 8.0 failed 4 times, most recent failure: Lost task 37.3 in 
stage 8.0 (TID 61, xk3, executor 3): org.apache.spark.SparkException: Task 
failed while writing rows.
        at 
org.apache.spark.sql.hive.SparkHiveDynamicPartitionWriterContainer.writeToFile(hiveWriterContainers.scala:328)
        at 
org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:210)
        at 
org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:210)
        at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
        at org.apache.spark.scheduler.Task.run(Task.scala:99)
        at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
        at 
java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
        at 
java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
        at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.shuffle.FetchFailedException: 
java.io.FileNotFoundException: 
/srv/BigData/hadoop/data1/nm/localdir/usercache/super/appcache/application_1484570747988_0012/blockmgr-3c7cc8f2-a11e-4fd4-b671-b8c7db6132ca/32/shuffle_1_1_0.index
 (No such file or directory)
```
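
My reading of the stack trace (not a confirmed root-cause analysis): the 
FetchFailedException caused by the missing shuffle_1_1_0.index only appears as 
the *cause* of a generic SparkException thrown from the Hive writer, so the 
scheduler sees an ordinary task failure instead of a fetch failure, counts it 
against spark.task.maxFailures, and never resubmits the lost map stage. A 
minimal Scala sketch of that wrapping (writeRows is a hypothetical stand-in for 
SparkHiveDynamicPartitionWriterContainer.writeToFile, not Spark source):
```scala
import org.apache.spark.SparkException

// Sketch of the failure path suggested by the stack trace above.
def writeRows(rows: Iterator[Any]): Unit =
  try {
    rows.foreach(_ => ()) // consuming the shuffled input performs the shuffle fetch
  } catch {
    case t: Throwable =>
      // The FetchFailedException is rethrown as the *cause* of a generic exception,
      // so the task's reported failure reason is no longer a fetch failure: the
      // DAGScheduler does not resubmit the map stage, and the attempt just counts
      // toward spark.task.maxFailures until the job aborts.
      throw new SparkException("Task failed while writing rows.", t)
  }
```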



          Component/s: SQL
              Summary: Stage is not retried when shuffle file is lost   (was: 
Stage is not retried when )

> Stage is not retried when shuffle file is lost 
> -----------------------------------------------
>
>                 Key: SPARK-19273
>                 URL: https://issues.apache.org/jira/browse/SPARK-19273
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.1.0
>            Reporter: xukun



