[
https://issues.apache.org/jira/browse/IMPALA-11539?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17602507#comment-17602507
]
Michael Smith commented on IMPALA-11539:
----------------------------------------
The new test test_mt_dop_skew_lpt fails with Ozone:
{code}
TestHdfsScannerSkew.test_mt_dop_skew_lpt[protocol: beeswax | exec_option:
{'test_replan': 1, 'batch_size': 0, 'num_nodes': 0,
'disable_codegen_rows_threshold': 0, 'disable_codegen': False,
'abort_on_error': 1, 'exec_single_node_rows_threshold': 0} | table_format:
text/none]
tests/query_test/test_scanners.py:442: in test_mt_dop_skew_lpt
assert cnt_fail < 3
E assert 5 < 3
------------------------------------------------------ Captured stderr setup
-------------------------------------------------------
SET
client_identifier=query_test/test_scanners.py::TestHdfsScannerSkew::()::test_mt_dop_skew_lpt[protocol:beeswax|exec_option:{'test_replan':1;'batch_size':0;'num_nodes':0;'disable_codegen_rows_threshold':0;'disable_codegen':False;'abort_on_error':1;'exec_single_node_rows_thre;
-- connecting to: localhost:21000
-- connecting to localhost:21050 with impyla
-- 2022-09-09 11:31:28,595 INFO MainThread: Closing active operation
-- connecting to localhost:28000 with impyla
-- 2022-09-09 11:31:28,606 INFO MainThread: Closing active operation
-- connecting to localhost:11050 with impyla
-- 2022-09-09 11:31:28,608 INFO MainThread: Could not connect to
('127.0.0.1', 11050)
Traceback (most recent call last):
File
"/home/michael/CDP_Impala_Ozone/toolchain/toolchain-packages-gcc7.5.0/thrift-0.11.0-p5/python/lib/python2.7/site-packages/thrift/transport/TSocket.py",
line 104, in open
handle.connect(sockaddr)
File
"/home/michael/CDP_Impala_Ozone/toolchain/toolchain-packages-gcc7.5.0/python-2.7.16/lib/python2.7/socket.py",
line 228, in meth
return getattr(self._sock,name)(*args)
error: [Errno 111] Connection refused
-- 2022-09-09 11:31:28,608 ERROR MainThread: Could not connect to any of
[('127.0.0.1', 11050)]
-- 2022-09-09 11:31:28,608 INFO MainThread: HS2 FENG connection setup
failed, continuing...: Could not connect to any of [('127.0.0.1', 11050)]
SET
client_identifier=query_test/test_scanners.py::TestHdfsScannerSkew::()::test_mt_dop_skew_lpt[protocol:beeswax|exec_option:{'test_replan':1;'batch_size':0;'num_nodes':0;'disable_codegen_rows_threshold':0;'disable_codegen':False;'abort_on_error':1;'exec_single_node_rows_thre;
SET sync_ddl=False;
-- executing against localhost:21000
DROP DATABASE IF EXISTS `test_mt_dop_skew_lpt_95cba3d6` CASCADE;
-- 2022-09-09 11:31:28,640 INFO MainThread: Started query
4b41d4f0fb16d5a8:e4d5a95e00000000
SET
client_identifier=query_test/test_scanners.py::TestHdfsScannerSkew::()::test_mt_dop_skew_lpt[protocol:beeswax|exec_option:{'test_replan':1;'batch_size':0;'num_nodes':0;'disable_codegen_rows_threshold':0;'disable_codegen':False;'abort_on_error':1;'exec_single_node_rows_thre;
SET sync_ddl=False;
-- executing against localhost:21000
CREATE DATABASE `test_mt_dop_skew_lpt_95cba3d6`;
-- 2022-09-09 11:31:29,393 INFO MainThread: Started query
2844251a67252207:9a8e607f00000000
-- 2022-09-09 11:31:29,598 INFO MainThread: Created database
"test_mt_dop_skew_lpt_95cba3d6" for test ID
"query_test/test_scanners.py::TestHdfsScannerSkew::()::test_mt_dop_skew_lpt[protocol:
beeswax | exec_option: {'test_replan': 1, 'batch_size': 0, 'num_nodes': 0,
'disable_codegen_rows_threshold': 0, 'disable_codegen': False,
'abort_on_error': 1, 'exec_single_node_rows_threshold': 0} | table_format:
text/none]"
------------------------------------------------------- Captured stderr call
-------------------------------------------------------
SET
client_identifier=query_test/test_scanners.py::TestHdfsScannerSkew::()::test_mt_dop_skew_lpt[protocol:beeswax|exec_option:{'test_replan':1;'batch_size':0;'num_nodes':0;'disable_codegen_rows_threshold':0;'disable_codegen':False;'abort_on_error':1;'exec_single_node_rows_thre;
-- connecting to: localhost:21000
SET mt_dop=2;
-- executing against localhost:21000
create table test_mt_dop_skew_lpt_95cba3d6.lineitem_skew like tpch.lineitem;
-- 2022-09-09 11:31:33,352 INFO MainThread: Started query
ec41751a233a6d2f:fd0fcd5c00000000
-- executing against localhost:21000
insert into test_mt_dop_skew_lpt_95cba3d6.lineitem_skew select * from
tpch.lineitem
where l_orderkey % 5 = 0;
-- 2022-09-09 11:31:37,523 INFO MainThread: Started query
f44901d4dddb2a6a:1185ccb300000000
-- executing against localhost:21000
insert into test_mt_dop_skew_lpt_95cba3d6.lineitem_skew select * from
tpch.lineitem
where l_orderkey % 5 = 0;
-- 2022-09-09 11:31:39,542 INFO MainThread: Started query
3f4490826ad73017:5f52d25000000000
-- executing against localhost:21000
insert into test_mt_dop_skew_lpt_95cba3d6.lineitem_skew select * from
tpch.lineitem
where l_orderkey % 5 = 0;
-- 2022-09-09 11:31:40,766 INFO MainThread: Started query
e248250f2614df91:3ff559ec00000000
-- executing against localhost:21000
insert into test_mt_dop_skew_lpt_95cba3d6.lineitem_skew select * from
tpch.lineitem
where l_orderkey % 5 = 0;
-- 2022-09-09 11:31:41,935 INFO MainThread: Started query
174105ef3a24f1ae:d8244ed400000000
-- executing against localhost:21000
insert into test_mt_dop_skew_lpt_95cba3d6.lineitem_skew select * from
tpch.lineitem;
-- 2022-09-09 11:31:43,107 INFO MainThread: Started query
c84939c297853093:4044015600000000
-- executing against localhost:21000
select min(l_orderkey),min(l_partkey),min(l_suppkey),min(l_linenumber),
min(l_quantity),min(l_extendedprice),min(l_discount),min(l_tax),
min(l_returnflag),min(l_linestatus),min(l_shipdate),min(l_commitdate),
min(l_receiptdate),min(l_shipinstruct),min(l_shipmode),min(l_comment)
from test_mt_dop_skew_lpt_95cba3d6.lineitem_skew;
-- 2022-09-09 11:31:46,705 INFO MainThread: Started query
6a4b1693735e777e:985fca5e00000000
-- executing against localhost:21000
select min(l_orderkey),min(l_partkey),min(l_suppkey),min(l_linenumber),
min(l_quantity),min(l_extendedprice),min(l_discount),min(l_tax),
min(l_returnflag),min(l_linestatus),min(l_shipdate),min(l_commitdate),
min(l_receiptdate),min(l_shipinstruct),min(l_shipmode),min(l_comment)
from test_mt_dop_skew_lpt_95cba3d6.lineitem_skew;
-- 2022-09-09 11:31:48,204 INFO MainThread: Started query
e54744ec169b0878:e99a28eb00000000
-- executing against localhost:21000
select min(l_orderkey),min(l_partkey),min(l_suppkey),min(l_linenumber),
min(l_quantity),min(l_extendedprice),min(l_discount),min(l_tax),
min(l_returnflag),min(l_linestatus),min(l_shipdate),min(l_commitdate),
min(l_receiptdate),min(l_shipinstruct),min(l_shipmode),min(l_comment)
from test_mt_dop_skew_lpt_95cba3d6.lineitem_skew;
-- 2022-09-09 11:31:49,638 INFO MainThread: Started query
fd4920394ca0e2fa:e38bcc0100000000
-- executing against localhost:21000
select min(l_orderkey),min(l_partkey),min(l_suppkey),min(l_linenumber),
min(l_quantity),min(l_extendedprice),min(l_discount),min(l_tax),
min(l_returnflag),min(l_linestatus),min(l_shipdate),min(l_commitdate),
min(l_receiptdate),min(l_shipinstruct),min(l_shipmode),min(l_comment)
from test_mt_dop_skew_lpt_95cba3d6.lineitem_skew;
-- 2022-09-09 11:31:51,078 INFO MainThread: Started query
7a4a8052f1532839:5fa87ca400000000
-- executing against localhost:21000
select min(l_orderkey),min(l_partkey),min(l_suppkey),min(l_linenumber),
min(l_quantity),min(l_extendedprice),min(l_discount),min(l_tax),
min(l_returnflag),min(l_linestatus),min(l_shipdate),min(l_commitdate),
min(l_receiptdate),min(l_shipinstruct),min(l_shipmode),min(l_comment)
from test_mt_dop_skew_lpt_95cba3d6.lineitem_skew;
-- 2022-09-09 11:31:52,626 INFO MainThread: Started query
264d6f30c96f3622:786ecd8b00000000
-- closing connection to: localhost:21000
{code}
> Mitigate intra-node skew of HDFS scans with MT_DOP
> --------------------------------------------------
>
> Key: IMPALA-11539
> URL: https://issues.apache.org/jira/browse/IMPALA-11539
> Project: IMPALA
> Issue Type: Bug
> Components: Backend
> Reporter: Zoltán Borók-Nagy
> Assignee: Zoltán Borók-Nagy
> Priority: Major
>
> Before IMPALA-9655, scan ranges were statically assigned to intra-node
> fragment instances based on the Longest Processing Time (LPT) algorithm:
> https://github.com/apache/impala/blame/a7866a94578be6289bbac31686de4d9032ad9261/be/src/scheduling/scheduler.cc#L499-L501
> Since IMPALA-9655, we use dynamic intra-node load balancing for HDFS scans.
> This means the fragment instances share a queue of scan ranges, and each
> fragment instance grabs the next scan range to be read from this queue.
> IMPALA-9655 got rid of the LPT algorithm, which means the scan ranges are in
> a random order in the queue. This can lead to skew if there are large scan
> ranges at the end.
> We could combine the two approaches above by using a priority queue for the
> scan ranges, so that each fragment instance grabs the largest remaining scan
> range in the queue. This could further mitigate intra-node skew.
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]