[ https://issues.apache.org/jira/browse/TEZ-1223?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel ]
Rajesh Balamohan updated TEZ-1223: ---------------------------------- Attachment: shuffle_data.tar.gz Update: ====== 1. When a ShuffleHeader parsing error occurred, I downloaded the pending data from the InputStream (stored as corr_* in the attachment). Also, I downloaded the original shuffle data by invoking the same shuffle URL again in the fetcher code (stored as original_* in the attachment). The first line of "original_*" in the attachment has the shuffle URL with all map attempt ids. 2. "attempt_1399351577718_4326_1_05_000769_0_10033", which is listed in the corr_* data, was not even requested in the original URL. This can be verified by looking at the shuffle URL. 3. Attached here is just a sample. I tried to correlate the same pattern across different shuffle exceptions in the job and all of them had the same issue. 4. This would be helpful in proving the point that NM/ShuffleHandler/Netty is jumbling up the responses across Channels (a sort of race condition). Even though it would not cause any functional error (as ShuffleHeader exceptions are thrown and data is fetched again), it would put pressure on the NM and waste resources. > Shuffle errors at 10 TB scale > ----------------------------- > > Key: TEZ-1223 > URL: https://issues.apache.org/jira/browse/TEZ-1223 > Project: Apache Tez > Issue Type: Bug > Reporter: Rajesh Balamohan > Assignee: Rajesh Balamohan > Labels: performance, scalability > Attachments: shuffle_data.tar.gz > > > When running a job with the following DAG at 10 TB scale, different shuffle > exceptions occurred. Creating this as an umbrella ticket for tracking these > errors. Most of them are related to ShuffleHeader parsing. 
> DAG: > ===== > digraph rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1 > { graph [ > label="rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1", > fontsize=24, fontname=Helvetica]; node [fontsize=12, fontname=Helvetica]; > edge [fontsize=9, fontcolor=blue, fontname=Arial]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" [ label > = "Map_5[MapTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ > label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n > dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ > label = "Reducer_9[ReduceTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ > label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n > dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11_store_returns" > [ label = "Map_11[store_returns]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11_store_returns" > -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" [ > label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" > ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4_out_Reducer_4" > [ label = "Reducer_4[out_Reducer_4]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" [ label > = "Map_10[MapTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ > label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n > 
dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" [ label > = "Map_8[MapTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ > label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n > dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" [ label > = "Map_1[MapTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ > label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n > dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" [ label > = "Map_6[MapTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ > label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n > dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10_item" [ > label = "Map_10[item]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10_item" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_10" [ label > = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6_d3" [ > label = "Map_6[d3]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6_d3" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_6" [ label > = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; > 
"rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2_catalog_sales" > [ label = "Map_2[catalog_sales]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2_catalog_sales" > -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" [ > label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" > ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" [ > label = "Map_2[MapTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_2" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ > label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n > dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ > label = "Reducer_3[ReduceTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" [ > label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n > dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8_store_sales" > [ label = "Map_8[store_sales]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8_store_sales" > -> "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_8" [ > label = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" > ]; "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7_store" > [ label = "Map_7[store]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7_store" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" [ label > = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; > 
"rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5_d2" [ > label = "Map_5[d2]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5_d2" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_5" [ label > = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" [ > label = "Reducer_4[ReduceTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_4_out_Reducer_4" > [ label = "Output [outputClass=MROutput,\n initializer=]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" [ label > = "Map_11[MapTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_11" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_9" [ > label = "[input=OnFileSortedOutput,\n output=ShuffledMergedInputLegacy,\n > dataMovement=SCATTER_GATHER,\n schedulingType=SEQUENTIAL]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1_d1" [ > label = "Map_1[d1]", shape = "box" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1_d1" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_1" [ label > = "Input [inputClass=MRInputLegacy,\n initializer=HiveSplitGenerator]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" [ label > = "Map_7[MapTezProcessor]" ]; > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Map_7" -> > "rajesh_20140622203232_d3d3d3ce_3d7a_4f04_ad05_31df915a74fd_1.Reducer_3" [ > label = "[input=OnFileUnorderedKVOutput,\n output=ShuffledUnorderedKVInput,\n > dataMovement=BROADCAST,\n schedulingType=SEQUENTIAL]" ]; } -- This message was sent by Atlassian JIRA (v6.2#6252)