[
https://issues.apache.org/jira/browse/IMPALA-11665?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=17620584#comment-17620584
]
Qifan Chen commented on IMPALA-11665:
-------------------------------------
Set up a table with nulls and empty strings in the STRING columns. When loading,
configured the table's data file to have 1 page in one run and 3 pages in another.
Ran the query in the DML section below and observed the following when the fast
code path was taken:
1. Nulls are not part of the page min/max stats and min/max filter stats at
all, which is good;
2. The runtime filtering works as designed.
DDL
{code:java}
create table null_pq (
id string,
null_str string,
null_int int
)
sort by (null_str)
stored as parquet
;
{code}
data loading:
{code:java}
set PARQUET_PAGE_ROW_COUNT_LIMIT=12;
insert into null_pq values
('a', null, 1),
('b', null, 2),
('c',null,3),
('aa', 'a', 1),
('ab', 'b', 2),
('ac','c',3),
('ad', '', 4),
('ae', '', 5),
('ac','',6);
{code}
1 page case (set PARQUET_PAGE_ROW_COUNT_LIMIT=12)
{code:java}
[14:11:06 qchen@qifan-10229: src] pqtools dump
hdfs://localhost:20500/test-warehouse/null_pq/9341bc3df646c530-9701c2fc00000000_162963959_data.0.parq
22/10/17 14:23:15 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
row group 0
--------------------------------------------------------------------------------
id: BINARY SNAPPY DO:4 FPO:56 SZ:85/89/1.05 VC:9 ENC:RLE,PLAIN_DICTIONARY
null_str: BINARY SNAPPY DO:146 FPO:180 SZ:64/60/0.94 VC:9 ENC:RLE,PLA [more]...
null_int: INT32 SNAPPY DO:273 FPO:312 SZ:72/68/0.94 VC:9 ENC:RLE,PLAI [more]...
id TV=9 RL=0 DL=1 DS: 8 DE:PLAIN_DICTIONARY
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:9
null_str TV=9 RL=0 DL=1 DS: 4 DE:PLAIN_DICTIONARY
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:9
null_int TV=9 RL=0 DL=1 DS: 6 DE:PLAIN_DICTIONARY
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:9
BINARY id
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 ***
value 1: R:0 D:1 V:ad
value 2: R:0 D:1 V:ae
value 3: R:0 D:1 V:ac
value 4: R:0 D:1 V:aa
value 5: R:0 D:1 V:ab
value 6: R:0 D:1 V:ac
value 7: R:0 D:1 V:a
value 8: R:0 D:1 V:b
value 9: R:0 D:1 V:c
BINARY null_str
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 ***
value 1: R:0 D:1 V:
value 2: R:0 D:1 V:
value 3: R:0 D:1 V:
value 4: R:0 D:1 V:a
value 5: R:0 D:1 V:b
value 6: R:0 D:1 V:c
value 7: R:0 D:0 V:<null>
value 8: R:0 D:0 V:<null>
value 9: R:0 D:0 V:<null>
INT32 null_int
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 ***
value 1: R:0 D:1 V:4
value 2: R:0 D:1 V:5
value 3: R:0 D:1 V:6
value 4: R:0 D:1 V:1
value 5: R:0 D:1 V:2
value 6: R:0 D:1 V:3
value 7: R:0 D:1 V:1
value 8: R:0 D:1 V:2
value 9: R:0 D:1 V:3
[14:23:16 qchen@qifan-10229: src]
{code}
3 pages case (set PARQUET_PAGE_ROW_COUNT_LIMIT=4)
{code:java}
pqtools dump
hdfs://localhost:20500/test-warehouse/null_pq/aa449f944bb9d005-7df200e300000000_811956887_data.0.parq
[13:50:22 qchen@qifan-10229: cluster] pqtools dump
hdfs://localhost:20500/test-warehouse/null_pq/aa449f944bb9d005-7df200e300000000_811956887_data.0.parq
22/10/17 13:51:02 INFO compress.CodecPool: Got brand-new decompressor [.snappy]
row group 0
--------------------------------------------------------------------------------
id: BINARY SNAPPY DO:4 FPO:56 SZ:139/139/1.00 VC:9 ENC:RLE,PLAI [more]...
null_str: BINARY SNAPPY DO:200 FPO:234 SZ:116/108/0.93 VC:9 ENC:RLE,P [more]...
null_int: INT32 SNAPPY DO:388 FPO:427 SZ:126/118/0.94 VC:9 ENC:RLE,PL [more]...
id TV=9 RL=0 DL=1 DS: 8 DE:PLAIN_DICTIONARY
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:4
page 1: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:4
page 2: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:1
null_str TV=9 RL=0 DL=1 DS: 4 DE:PLAIN_DICTIONARY
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:4
page 1: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:4
page 2: DLE:RLE RLE:RLE VLE:PLAIN ST:[no stat
[more]... VC:1
null_int TV=9 RL=0 DL=1 DS: 6 DE:PLAIN_DICTIONARY
----------------------------------------------------------------------------
page 0: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:4
page 1: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:4
page 2: DLE:RLE RLE:RLE VLE:PLAIN_DICTIONARY
[more]... VC:1
BINARY id
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 ***
value 1: R:0 D:1 V:ad
value 2: R:0 D:1 V:ae
value 3: R:0 D:1 V:ac
value 4: R:0 D:1 V:aa
value 5: R:0 D:1 V:ab
value 6: R:0 D:1 V:ac
value 7: R:0 D:1 V:a
value 8: R:0 D:1 V:b
value 9: R:0 D:1 V:c
BINARY null_str
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 ***
value 1: R:0 D:1 V:
value 2: R:0 D:1 V:
value 3: R:0 D:1 V:
value 4: R:0 D:1 V:a
value 5: R:0 D:1 V:b
value 6: R:0 D:1 V:c
value 7: R:0 D:0 V:<null>
value 8: R:0 D:0 V:<null>
value 9: R:0 D:0 V:<null>
INT32 null_int
--------------------------------------------------------------------------------
*** row group 1 of 1, values 1 to 9 ***
value 1: R:0 D:1 V:4
value 2: R:0 D:1 V:5
value 3: R:0 D:1 V:6
value 4: R:0 D:1 V:1
value 5: R:0 D:1 V:2
value 6: R:0 D:1 V:3
value 7: R:0 D:1 V:1
value 8: R:0 D:1 V:2
value 9: R:0 D:1 V:3
{code}
> Min/Max filter could crash in fast code path for string data type
> -----------------------------------------------------------------
>
> Key: IMPALA-11665
> URL: https://issues.apache.org/jira/browse/IMPALA-11665
> Project: IMPALA
> Issue Type: Bug
> Reporter: Abhishek Rawat
> Assignee: Qifan Chen
> Priority: Critical
>
> The impalad logs show that memcmp failed due to a segfault:
> {code:java}
> #
> # A fatal error has been detected by the Java Runtime Environment:
> #
> # SIGSEGV (0xb) at pc=0x00007f0396c3ff22, pid=1, tid=0x00007f023f365700
> #
> # JRE version: OpenJDK Runtime Environment (8.0_332-b09) (build 1.8.0_332-b09)
> # Java VM: OpenJDK 64-Bit Server VM (25.332-b09 mixed mode linux-amd64
> compressed oops)
> # Problematic frame:
> # C [libc.so.6+0x16af22] __memcmp_sse4_1+0xd42 {code}
> Resolved Stack Trace for the crashed thread:
> {code:java}
> Thread 530 (crashed)
> 0 libc-2.17.so + 0x16af22
> rax = 0x00007f61567715f0 rdx = 0x000000000000000a
> rcx = 0x00007f62ae04cf22 rbx = 0x0000000000000000
> rsi = 0x000000005d1e900a rdi = 0x000000000000000a
> rbp = 0x00007f6156771560 rsp = 0x00007f6156771548
> r8 = 0x00000000034d40f0 r9 = 0x00007f62ae022e90
> r10 = 0x000000000498ff6c r11 = 0x00007f62ae06f590
> r12 = 0x000000000000000a r13 = 0x000000001a9678e8
> r14 = 0x00007f6156771730 r15 = 0x0000000001b1f380
> rip = 0x00007f62ae04cf22
> Found by: given as instruction pointer in context
> 1
> impalad!impala::HdfsParquetScanner::CollectSkippedPageRangesForSortedColumn(impala::MinMaxFilter
> const*, impala::ColumnType const&,
> std::vector<std::__cxx11::basic_string<char, std::char_traits<char>,
> std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > > > const&,
> std::vector<std::__cxx11::basic_string<char, std::char_traits<char>,
> std::allocator<char> >, std::allocator<std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > > > const&, int, int,
> std::vector<impala::PageRange, std::allocator<impala::PageRange> >*)
> [hdfs-parquet-scanner.cc : 1388 + 0x3]
> rbp = 0x00007f6156771650 rsp = 0x00007f6156771570
> rip = 0x0000000001b10305
> Found by: previous frame's frame pointer
> 2 impalad!impala::HdfsParquetScanner::SkipPagesBatch(parquet::RowGroup&,
> impala::ColumnStatsReader const&, parquet::ColumnIndex const&, int, int,
> impala::ColumnType const&, int, parquet::ColumnChunk const&,
> impala::MinMaxFilter const*, std::vector<impala::RowRange,
> std::allocator<impala::RowRange> >*, int*) [hdfs-parquet-scanner.cc : 1230 +
> 0x34]
> rbx = 0x00007f61567716f0 rbp = 0x00007f61567717e0
> rsp = 0x00007f6156771660 r12 = 0x00007f6156771710
> r13 = 0x00007f6156771950 r14 = 0x000000001a9678e8
> r15 = 0x00007f6156771920 rip = 0x0000000001b14838
> Found by: call frame info
> 3
> impalad!impala::HdfsParquetScanner::FindSkipRangesForPagesWithMinMaxFilters(std::vector<impala::RowRange,
> std::allocator<impala::RowRange> >*) [hdfs-parquet-scanner.cc : 1528 + 0x57]
> rbx = 0x000000000000004a rbp = 0x00007f6156771b10
> rsp = 0x00007f61567717f0 r12 = 0x000000002c195800
> r13 = 0x000000002aa115d0 r14 = 0x0000000000000001
> r15 = 0x0000000000000049 rip = 0x0000000001b1cf1a
> Found by: call frame info
> 4 impalad!impala::HdfsParquetScanner::EvaluatePageIndex()
> [hdfs-parquet-scanner.cc : 1600 + 0x19]
> rbx = 0x00007f6156771c30 rbp = 0x00007f6156771cf0
> rsp = 0x00007f6156771b20 r12 = 0x000000002c195800
> r13 = 0x00007f6156771de8 r14 = 0x00000000104528a0
> r15 = 0x00007f6156771df0 rip = 0x0000000001b1d9dd
> Found by: call frame info
> 5 impalad!impala::HdfsParquetScanner::ProcessPageIndex()
> [hdfs-parquet-scanner.cc : 1318 + 0xb]
> rbx = 0x000000002c195800 rbp = 0x00007f6156771d70
> rsp = 0x00007f6156771d00 r12 = 0x00007f6156771d10
> r13 = 0x00007f6156771de8 r14 = 0x00000000104528a0
> r15 = 0x00007f6156771df0 rip = 0x0000000001b1dd0b
> Found by: call frame info
> 6 impalad!impala::HdfsParquetScanner::NextRowGroup()
> [hdfs-parquet-scanner.cc : 934 + 0xf]
> rbx = 0x00000000318ce040 rbp = 0x00007f6156771e40
> rsp = 0x00007f6156771d80 r12 = 0x000000002c195800
> r13 = 0x00007f6156771de8 r14 = 0x00000000104528a0
> r15 = 0x00007f6156771df0 rip = 0x0000000001b1e1b4
> Found by: call frame info
> 7 impalad!impala::HdfsParquetScanner::GetNextInternal(impala::RowBatch*)
> [hdfs-parquet-scanner.cc : 504 + 0xb]
> rbx = 0x000000002c195800 rbp = 0x00007f6156771ec0
> rsp = 0x00007f6156771e50 r12 = 0x00000000c1ca4d00
> r13 = 0x00007f6156771e78 r14 = 0x00007f6156771e80
> r15 = 0xaaaaaaaaaaaaaaab rip = 0x0000000001b1ed5b
> Found by: call frame info
> 8 impalad!impala::HdfsScanNodeMt::GetNext(impala::RuntimeState*,
> impala::RowBatch*, bool*) [hdfs-scanner.h : 138 + 0x1d]
> rbx = 0x0000000012272a00 rbp = 0x00007f6156772070
> rsp = 0x00007f6156771ed0 r12 = 0x000000002c195800
> r13 = 0x0000000000000000 r14 = 0x00007f6156771f70
> r15 = 0x00007f6156771fd0 rip = 0x00000000017d6235
> Found by: call frame info
> 9 impalad!impala::BlockingJoinNode::GetFirstProbeRow(impala::RuntimeState*)
> [blocking-join-node.cc : 316 + 0x6]
> rbx = 0x000000000adba000 rbp = 0x00007f61567720c0
> rsp = 0x00007f6156772080 r12 = 0x00007f6156772088
> r13 = 0x000000000adba209 r14 = 0x00000000496b9680
> r15 = 0x00007f61567720e0 rip = 0x00000000018c2069
> Found by: call frame info
> 10 impalad!impala::PartitionedHashJoinNode::Open(impala::RuntimeState*)
> [partitioned-hash-join-node.cc : 215 + 0xe]
> rbx = 0x000000000adba000 rbp = 0x00007f6156772170
> rsp = 0x00007f61567720d0 r12 = 0x00007f61567720e0
> r13 = 0x00000000496b9680 r14 = 0x00007f6156772290
> r15 = 0x0000000042c22030 rip = 0x000000000186c68d
> Found by: call frame info
> 11
> impalad!impala::BlockingJoinNode::ProcessBuildInputAndOpenProbe(impala::RuntimeState*,
> impala::JoinBuilder*) [blocking-join-node.cc : 242 + 0x6]
> rbx = 0x000000000adbb400 rbp = 0x00007f6156772300
> rsp = 0x00007f6156772180 r12 = 0x00007f6156772290
> r13 = 0x00007f6156772320 r14 = 0x00000000496b9680
> r15 = 0x0000000010f1cf00 rip = 0x00000000018c33b7
> Found by: call frame info
> 12 impalad!impala::PartitionedHashJoinNode::Open(impala::RuntimeState*)
> [partitioned-hash-join-node.cc : 209 + 0x15]
> rbx = 0x000000000adbb400 rbp = 0x00007f61567723b0
> rsp = 0x00007f6156772310 r12 = 0x00007f6156772320
> r13 = 0x00000000496b9680 r14 = 0x00007f6156772440
> r15 = 0x0000000042c47660 rip = 0x000000000186c62d
> Found by: call frame info
> 13 impalad!impala::SortNode::Open(impala::RuntimeState*) [sort-node.cc : 123
> + 0x6]
> rbx = 0x00000000496b9b00 rbp = 0x00007f61567724e0
> rsp = 0x00007f61567723c0 r12 = 0x00007f6156772440
> r13 = 0x0000000042c46e90 r14 = 0x00007f6156772420
> r15 = 0x00000000496b9680 rip = 0x0000000001892002
> Found by: call frame info
> 14 impalad!impala::FragmentInstanceState::Open() [fragment-instance-state.cc
> : 426 + 0x11]
> rbx = 0x000000000b0863c0 rbp = 0x00007f61567726a0
> rsp = 0x00007f61567724f0 r12 = 0x00007f61567725b0
> r13 = 0x0000000010f1d2c0 r14 = 0x00007f6156772510
> r15 = 0x000000001196be00 rip = 0x000000000129bbe3
> Found by: call frame info
> 15 impalad!impala::FragmentInstanceState::Exec() [fragment-instance-state.cc
> : 95 + 0xf]
> rbx = 0x000000000b0863c0 rbp = 0x00007f6156772760
> rsp = 0x00007f61567726b0 r12 = 0x00007f61567727a8
> r13 = 0x000000001aabd330 r14 = 0x00007f61567726f0
> r15 = 0x000000001196be00 rip = 0x000000000129dabd
> Found by: call frame info
> 16 impalad!impala::QueryState::ExecFInstance(impala::FragmentInstanceState*)
> [query-state.cc : 955 + 0x19]
> rbx = 0x00007f61567727d0 rbp = 0x00007f6156772830
> rsp = 0x00007f6156772770 r12 = 0x000000000b0863c0
> r13 = 0x00007f61567727b0 r14 = 0x0000000004950770
> r15 = 0x000000001196be00 rip = 0x0000000001223f01
> Found by: call frame info
> 17 impalad!impala::Thread::SuperviseThread(std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > const&,
> std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>
> > const&, boost::function<void ()>, impala::ThreadDebugInfo const*,
> impala::Promise<long, (impala::PromiseMode)0>*) [function_template.hpp : 763
> + 0x7]
> rbx = 0x00007f61567728c0 rbp = 0x00007f6156772b30
> rsp = 0x00007f6156772840 r12 = 0x00007f61567728a0
> r13 = 0x0000000006dfe300 r14 = 0x00007f62b159c7d0
> r15 = 0x00007f615a377dd8 rip = 0x000000000171aeb2
> Found by: call frame info
> 18 impalad!boost::detail::thread_data<boost::_bi::bind_t<void, void
> (*)(std::__cxx11::basic_string<char, std::char_traits<char>,
> std::allocator<char> > const&, std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > const&, boost::function<void
> ()>, impala::ThreadDebugInfo const*, impala::Promise<long,
> (impala::PromiseMode)0>*),
> boost::_bi::list5<boost::_bi::value<std::__cxx11::basic_string<char,
> std::char_traits<char>, std::allocator<char> > >,
> boost::_bi::value<std::__cxx11::basic_string<char, std::char_traits<char>,
> std::allocator<char> > >, boost::_bi::value<boost::function<void ()> >,
> boost::_bi::value<impala::ThreadDebugInfo*>,
> boost::_bi::value<impala::Promise<long, (impala::PromiseMode)0>*> > >
> >::run() [bind.hpp : 531 + 0xc]
> rbx = 0x0000000063492300 rbp = 0x00007f6156772b90
> rsp = 0x00007f6156772b40 r12 = 0x00007f6156772b40
> r13 = 0x000000000171abb0 r14 = 0x00007f615a3788c0
> r15 = 0x00007f615a377da0 rip = 0x000000000171c3ab
> Found by: call frame info
> 19 impalad!thread_proxy + 0xa1
> rbx = 0x0000000000000000 rbp = 0x0000000063492300
> rsp = 0x00007f6156772ba0 r12 = 0x0000000000000000
> r13 = 0x0000000018a976c0 r14 = 0x0000000000000000
> r15 = 0x00007f6156773700 rip = 0x0000000001fac9d1
> Found by: call frame info
> 20 libpthread-2.17.so + 0x7ea5
> rbx = 0x0000000000000000 rbp = 0x0000000000000000
> rsp = 0x00007f6156772be0 r12 = 0x0000000000000000
> r13 = 0x0000000000a01000 r14 = 0x0000000000000000
> r15 = 0x00007f6156773700 rip = 0x00007f62b1597ea5
> Found by: call frame info
> 21 libc-2.17.so + 0xfeb0d
> rsp = 0x00007f6156772c80 rip = 0x00007f62adfe0b0d
> Found by: stack scanning
> {code}
--
This message was sent by Atlassian Jira
(v8.20.10#820010)
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]