[ https://issues.apache.org/jira/browse/SPARK-19273?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
xukun updated SPARK-19273:
--------------------------
Description:

Execute "insert into table select * from a join b on a.xx = b.xx". During the shuffle stage we delete the shuffle files; the shuffle stage is not retried, and the job fails because a single task fails 4 times.

Detail: create two external tables over the TPC-DS data:

{quote}
create external table date_dim
(
  d_date_sk int,
  d_date_id string,
  d_date string,
  d_month_seq int,
  d_week_seq int,
  d_quarter_seq int,
  d_year int,
  d_dow int,
  d_moy int,
  d_dom int,
  d_qoy int,
  d_fy_year int,
  d_fy_quarter_seq int,
  d_fy_week_seq int,
  d_day_name string,
  d_quarter_name string,
  d_holiday string,
  d_weekend string,
  d_following_holiday string,
  d_first_dom int,
  d_last_dom int,
  d_same_day_ly int,
  d_same_day_lq int,
  d_current_day string,
  d_current_week string,
  d_current_month string,
  d_current_quarter string,
  d_current_year string
)
row format delimited fields terminated by '|'
location 'path1';

create external table web_sales
(
  ws_sold_date_sk int,
  ws_sold_time_sk int,
  ws_ship_date_sk int,
  ws_item_sk int,
  ws_bill_customer_sk int,
  ws_bill_cdemo_sk int,
  ws_bill_hdemo_sk int,
  ws_bill_addr_sk int,
  ws_ship_customer_sk int,
  ws_ship_cdemo_sk int,
  ws_ship_hdemo_sk int,
  ws_ship_addr_sk int,
  ws_web_page_sk int,
  ws_web_site_sk int,
  ws_ship_mode_sk int,
  ws_warehouse_sk int,
  ws_promo_sk int,
  ws_order_number int,
  ws_quantity int,
  ws_wholesale_cost float,
  ws_list_price float,
  ws_sales_price float,
  ws_ext_discount_amt float,
  ws_ext_sales_price float,
  ws_ext_wholesale_cost float,
  ws_ext_list_price float,
  ws_ext_tax float,
  ws_coupon_amt float,
  ws_ext_ship_cost float,
  ws_net_paid float,
  ws_net_paid_inc_tax float,
  ws_net_paid_inc_ship float,
  ws_net_paid_inc_ship_tax float,
  ws_net_profit float
)
row format delimited fields terminated by '|'
location 'path2';
{quote}

Then execute SQL like this (spark.sql.autoBroadcastJoinThreshold = 1 disables broadcast joins, so the join runs as a shuffle join):

{quote}
create table web_sales1
(
  ws_sold_date_sk int,
  ws_sold_time_sk int,
  ws_ship_date_sk int,
  ws_item_sk int,
  ws_bill_customer_sk int,
  ws_bill_cdemo_sk int,
  ws_bill_hdemo_sk int,
  ws_bill_addr_sk int,
  ws_ship_customer_sk int,
  ws_ship_cdemo_sk int,
  ws_ship_hdemo_sk int,
  ws_ship_addr_sk int,
  ws_web_page_sk int,
  ws_web_site_sk int,
  ws_ship_mode_sk int,
  ws_warehouse_sk int,
  ws_promo_sk int,
  ws_order_number int,
  ws_quantity int,
  ws_wholesale_cost float,
  ws_list_price float,
  ws_sales_price float,
  ws_ext_discount_amt float,
  ws_ext_sales_price float,
  ws_ext_wholesale_cost float,
  ws_ext_list_price float,
  ws_ext_tax float,
  ws_coupon_amt float,
  ws_ext_ship_cost float,
  ws_net_paid float,
  ws_net_paid_inc_tax float,
  ws_net_paid_inc_ship float,
  ws_net_paid_inc_ship_tax float,
  ws_net_profit float
)
partitioned by (ws_sold_date string)
stored as parquet;

set hive.exec.dynamic.partition=true;
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.max.dynamic.partitions=100;
set spark.sql.autoBroadcastJoinThreshold = 1;

insert overwrite table web_sales1 partition (ws_sold_date)
select
  ws.ws_sold_date_sk,
  ws.ws_sold_time_sk,
  ws.ws_ship_date_sk,
  ws.ws_item_sk,
  ws.ws_bill_customer_sk,
  ws.ws_bill_cdemo_sk,
  ws.ws_bill_hdemo_sk,
  ws.ws_bill_addr_sk,
  ws.ws_ship_customer_sk,
  ws.ws_ship_cdemo_sk,
  ws.ws_ship_hdemo_sk,
  ws.ws_ship_addr_sk,
  ws.ws_web_page_sk,
  ws.ws_web_site_sk,
  ws.ws_ship_mode_sk,
  ws.ws_warehouse_sk,
  ws.ws_promo_sk,
  ws.ws_order_number,
  ws.ws_quantity,
  ws.ws_wholesale_cost,
  ws.ws_list_price,
  ws.ws_sales_price,
  ws.ws_ext_discount_amt,
  ws.ws_ext_sales_price,
  ws.ws_ext_wholesale_cost,
  ws.ws_ext_list_price,
  ws.ws_ext_tax,
  ws.ws_coupon_amt,
  ws.ws_ext_ship_cost,
  ws.ws_net_paid,
  ws.ws_net_paid_inc_tax,
  ws.ws_net_paid_inc_ship,
  ws.ws_net_paid_inc_ship_tax,
  ws.ws_net_profit,
  dd.d_date as ws_sold_date
from tpcds_text.web_sales ws
join tpcds_text.date_dim dd
  on (ws.ws_sold_date_sk = dd.d_date_sk);
{quote}

After the map stage finishes, delete the executor's shuffle files; the job then fails. The log is:

{quote}
17/01/18 16:28:49 INFO TaskSetManager: Starting task 37.1 in stage 8.0 (TID 52, xk3, executor 6, partition 37, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 38.0 in stage 8.0 (TID 49) on xk3, executor 6: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 1]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 38.1 in stage 8.0 (TID 53, xk3, executor 2, partition 38, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 39.0 in stage 8.0 (TID 50) on xk3, executor 2: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 2]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 39.1 in stage 8.0 (TID 54, xk3, executor 3, partition 39, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 40.0 in stage 8.0 (TID 51) on xk3, executor 3: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 3]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 40.1 in stage 8.0 (TID 55, xk3, executor 6, partition 40, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 37.1 in stage 8.0 (TID 52) on xk3, executor 6: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 4]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 37.2 in stage 8.0 (TID 56, xk2, executor 4, partition 37, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Finished task 34.0 in stage 8.0 (TID 45) in 1209 ms on xk2 (executor 4) (35/200)
17/01/18 16:28:49 INFO TaskSetManager: Starting task 41.0 in stage 8.0 (TID 57, xk3, executor 3, partition 41, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 39.1 in stage 8.0 (TID 54) on xk3, executor 3: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 5]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 39.2 in stage 8.0 (TID 58, xk3, executor 2, partition 39, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 38.1 in stage 8.0 (TID 53) on xk3, executor 2: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 6]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 38.2 in stage 8.0 (TID 59, xk2, executor 1, partition 38, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Finished task 30.0 in stage 8.0 (TID 41) in 2224 ms on xk2 (executor 1) (36/200)
17/01/18 16:28:49 INFO TaskSetManager: Starting task 42.0 in stage 8.0 (TID 60, xk2, executor 4, partition 42, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 37.2 in stage 8.0 (TID 56) on xk2, executor 4: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 7]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 37.3 in stage 8.0 (TID 61, xk3, executor 3, partition 37, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 41.0 in stage 8.0 (TID 57) on xk3, executor 3: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 8]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 41.1 in stage 8.0 (TID 62, xk3, executor 6, partition 41, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 40.1 in stage 8.0 (TID 55) on xk3, executor 6: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 9]
17/01/18 16:28:49 INFO TaskSetManager: Starting task 40.2 in stage 8.0 (TID 63, xk2, executor 4, partition 40, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:49 INFO TaskSetManager: Lost task 42.0 in stage 8.0 (TID 60) on xk2, executor 4: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 10]
17/01/18 16:28:50 INFO TaskSetManager: Starting task 42.1 in stage 8.0 (TID 64, xk2, executor 1, partition 42, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:50 INFO TaskSetManager: Lost task 38.2 in stage 8.0 (TID 59) on xk2, executor 1: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 11]
17/01/18 16:28:50 INFO TaskSetManager: Starting task 38.3 in stage 8.0 (TID 65, xk3, executor 6, partition 38, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:50 INFO TaskSetManager: Lost task 41.1 in stage 8.0 (TID 62) on xk3, executor 6: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 12]
17/01/18 16:28:50 INFO TaskSetManager: Starting task 41.2 in stage 8.0 (TID 66, xk3, executor 2, partition 41, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:50 INFO TaskSetManager: Lost task 39.2 in stage 8.0 (TID 58) on xk3, executor 2: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 13]
17/01/18 16:28:50 INFO TaskSetManager: Starting task 39.3 in stage 8.0 (TID 67, xk3, executor 3, partition 39, NODE_LOCAL, 7416 bytes)
17/01/18 16:28:50 INFO TaskSetManager: Lost task 37.3 in stage 8.0 (TID 61) on xk3, executor 3: org.apache.spark.SparkException (Task failed while writing rows.) [duplicate 14]
17/01/18 16:28:50 ERROR TaskSetManager: Task 37 in stage 8.0 failed 4 times; aborting job
17/01/18 16:28:50 INFO YarnScheduler: Cancelling stage 8
17/01/18 16:28:50 INFO YarnScheduler: Stage 8 was cancelled
17/01/18 16:28:50 INFO DAGScheduler: ResultStage 8 (processCmd at CliDriver.java:377) failed in 13.292 s due to Job aborted due to stage failure: Task 37 in stage 8.0 failed 4 times, most recent failure: Lost task 37.3 in stage 8.0 (TID 61, xk3, executor 3): org.apache.spark.SparkException: Task failed while writing rows.
	at org.apache.spark.sql.hive.SparkHiveDynamicPartitionWriterContainer.writeToFile(hiveWriterContainers.scala:328)
	at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:210)
	at org.apache.spark.sql.hive.execution.InsertIntoHiveTable$$anonfun$saveAsHiveFile$3.apply(InsertIntoHiveTable.scala:210)
	at org.apache.spark.scheduler.ResultTask.runTask(ResultTask.scala:87)
	at org.apache.spark.scheduler.Task.run(Task.scala:99)
	at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:322)
	at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
	at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
	at java.lang.Thread.run(Thread.java:745)
Caused by: org.apache.spark.shuffle.FetchFailedException: java.io.FileNotFoundException: /srv/BigData/hadoop/data1/nm/localdir/usercache/super/appcache/application_1484570747988_0012/blockmgr-3c7cc8f2-a11e-4fd4-b671-b8c7db6132ca/32/shuffle_1_1_0.index (No such file or directory)
{quote}
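The "Caused by" line shows why no stage-level retry happens: the org.apache.spark.shuffle.FetchFailedException raised while fetching map output is caught inside SparkHiveDynamicPartitionWriterContainer.writeToFile and rethrown as a generic SparkException ("Task failed while writing rows."), so the executor reports an ordinary task failure rather than a fetch failure. The TaskSetManager then retries the same task until spark.task.maxFailures (default 4) is exhausted and aborts the job, and the DAGScheduler never resubmits the map stage whose output was lost. Below is a minimal sketch of that wrapping pattern; it is illustrative, not Spark source, and FakeFetchFailedException stands in for the real FetchFailedException, whose constructor needs a BlockManagerId plus shuffle/map/reduce ids:

{quote}
// Sketch of the failure mode, assuming the writer wraps every Throwable.
import org.apache.spark.SparkException

object WrappedFetchFailureSketch {
  // Illustrative stand-in for org.apache.spark.shuffle.FetchFailedException.
  class FakeFetchFailedException(msg: String) extends Exception(msg)

  // Writer loop in the style of writeToFile: consuming `rows` pulls shuffle
  // blocks from remote executors and can therefore throw a fetch failure.
  def writeToFile(rows: Iterator[String])(writeRow: String => Unit): Unit =
    try rows.foreach(writeRow)
    catch {
      // Wrapping the fetch failure makes the executor report a generic
      // ExceptionFailure instead of FetchFailed, so the scheduler retries the
      // task in place (up to 4 times) instead of re-running the map stage.
      case t: Throwable =>
        throw new SparkException("Task failed while writing rows.", t)
    }

  def main(args: Array[String]): Unit = {
    // Iterator.map is lazy, so the failure surfaces while writing row "c".
    val rows = Iterator("a", "b", "c").map {
      case "c" => throw new FakeFetchFailedException(
        "java.io.FileNotFoundException: .../shuffle_1_1_0.index")
      case s => s
    }
    writeToFile(rows)(println) // throws SparkException, fetch failure as cause
  }
}
{quote}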
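For comparison, a hypothetical self-contained way to reproduce the same shape of failure without Hive tables; the app name, row counts, and output path are made up, and the shuffle-file deletion still has to be done by hand on an executor host while the result stage is running:

{quote}
// Hypothetical repro sketch: force a shuffle join, then delete an executor's
// shuffle files while the result stage is writing.
import org.apache.spark.sql.SparkSession

object ShuffleFileLossRepro {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("shuffle-loss-repro").getOrCreate()

    // Same effect as "set spark.sql.autoBroadcastJoinThreshold = 1" in the
    // report: disables broadcast joins, so the join needs a shuffle stage.
    spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "1")

    val facts = spark.range(0L, 10000000L).selectExpr("id % 1000 AS k", "id AS v")
    val dim   = spark.range(0L, 1000L).selectExpr("id AS k", "CAST(id AS STRING) AS name")

    // While this stage runs, delete the blockmgr-*/ shuffle_*.index/.data
    // files under one executor's NodeManager local dir; each reduce task then
    // hits a FetchFailedException, reported as "Task failed while writing rows."
    facts.join(dim, "k")
      .write.mode("overwrite").parquet("/tmp/spark19273_repro")

    spark.stop()
  }
}
{quote}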

> Stage is not retried when shuffle file is lost
> -----------------------------------------------
>
>                 Key: SPARK-19273
>                 URL: https://issues.apache.org/jira/browse/SPARK-19273
>             Project: Spark
>          Issue Type: Bug
>          Components: SQL
>    Affects Versions: 2.1.0
>            Reporter: xukun

--
This message was sent by Atlassian JIRA
(v6.3.4#6332)

---------------------------------------------------------------------
To unsubscribe, e-mail: issues-unsubscr...@spark.apache.org
For additional commands, e-mail: issues-h...@spark.apache.org