[
https://issues.apache.org/jira/browse/ORC-1142?page=com.atlassian.jira.plugin.system.issuetabpanels:all-tabpanel
]
Quanlong Huang updated ORC-1142:
--------------------------------
Description:
BooleanRleDecoderImpl::next() takes some portion of the time when reading
nullable columns. Here is the histogram of running orc-scan on a tpcds
store_sales data file:
{code:java}
Samples: 7K of event 'cycles:ppp', Event count (approx.): 8731129738
Overhead Command Shared Object Symbol
31.71% orc-scan orc-scan [.] orc::Decimal64ColumnReader::next
7.44% orc-scan orc-scan [.]
orc::RleDecoderV2::copyDataFromBuffer
7.29% orc-scan orc-scan [.]
snappy::SnappyDecompressor::DecompressAllTags<snappy::SnappyArrayWriter>
6.19% orc-scan orc-scan [.] orc::BooleanRleDecoderImpl::next
5.54% orc-scan orc-scan [.] orc::RleDecoderV2::nextDirect
5.37% orc-scan orc-scan [.] orc::RleDecoderV2::nextShortRepeats
4.58% orc-scan orc-scan [.] snappy::(anonymous
namespace)::IncrementalCopy
3.66% orc-scan orc-scan [.] snappy::(anonymous
namespace)::UnalignedCopy64
3.43% orc-scan orc-scan [.] orc::RleDecoderV2::nextDelta
2.81% orc-scan orc-scan [.]
snappy::SnappyArrayWriter::AppendFromSelf
2.41% orc-scan orc-scan [.] orc::RleDecoderV2::next
2.22% orc-scan orc-scan [.] orc::RleDecoderV2::readByte
1.78% orc-scan orc-scan [.] snappy::(anonymous
namespace)::UnalignedCopy128
1.61% orc-scan libc-2.23.so [.] __memcpy_avx_unaligned
1.55% orc-scan orc-scan [.]
snappy::SnappyArrayWriter::TryFastAppend
1.44% orc-scan [kernel.kallsyms] [k] copy_user_enhanced_fast_string
1.30% orc-scan orc-scan [.] orc::ByteRleDecoderImpl::next
1.17% orc-scan orc-scan [.] orc::RleDecoderV2::unrolledUnpack24
1.09% orc-scan libc-2.23.so [.] __memset_avx2
1.02% orc-scan orc-scan [.] snappy::SnappyArrayWriter::Produced
0.93% orc-scan orc-scan [.] orc::RleDecoderV2::readLongs
0.81% orc-scan orc-scan [.] orc::RleDecoderV2::unrolledUnpack16
0.76% orc-scan orc-scan [.] orc::RleDecoderV2::unrolledUnpack32
0.64% orc-scan orc-scan [.] snappy::LittleEndian::Load32
0.37% orc-scan orc-scan [.] snappy::LittleEndian::ToHost32
0.27% orc-scan orc-scan [.] orc::ColumnReader::next
{code}
The time on BooleanRleDecoderImpl::next() is dominanted by this loop:
[https://github.com/apache/orc/blob/78849c99359cfb9f5c9da88b8e16d118154ed9da/c%2B%2B/src/ByteRLE.cc#L627-L631]
{code:cpp}
for(int64_t i=static_cast<int64_t>(numValues) - 1;
i >= static_cast<int64_t>(position); --i, --bitsLeft) {
uint64_t shiftPosn = (-bitsLeft) % 8;
data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
}
{code}
Disassembly of it:
{code:java}
orc::BooleanRleDecoderImpl::next
/home/quanlong/workspace/orc/build/tools/src/orc-scan
Percent│ pop %r14
...
8.67 │240:┌─→mov %r13,%rcx
8.47 │ │ sub $0x1,%r13
8.06 │ │ mov %r13,%rax
7.26 │ │ neg %rcx
8.06 │ │ shr $0x3,%rax
6.85 │ │ and $0x7,%ecx
5.85 │ │ movsbl (%rsi,%rax,1),%eax
19.96 │ │ sar %cl,%eax
8.06 │ │ and $0x1,%eax
7.66 │ │ mov %al,(%r15,%r13,1)
│ ├──cmp %rbx,%r13
9.68 │ └──jne 7d3240 <orc::BooleanRleDecoderImpl::next(char*, unsigned
long, 240
│ ↑ jmpq 7d316f <orc::BooleanRleDecoderImpl::next(char*, unsigned
long, 16f
│26b: mov %r14,%r13
│ xor %ebx,%ebx
│ ↑ jmpq 7d31c2 <orc::BooleanRleDecoderImpl::next(char*, unsigned
long, 1c2
{code}
We can unroll the loop to save some instructions. I have a WIP patch
([^bool-rle-opt.patch]) but I haven't optimized it further.
was:
BooleanRleDecoderImpl::next() takes some portion of the time when reading
nullable columns. Here is the histogram of running orc-scan on a tpcds
store_sales data file:
{code:java}
Samples: 7K of event 'cycles:ppp', Event count (approx.): 8731129738
Overhead Command Shared Object Symbol
31.71% orc-scan orc-scan [.] orc::Decimal64ColumnReader::next
7.44% orc-scan orc-scan [.]
orc::RleDecoderV2::copyDataFromBuffer
7.29% orc-scan orc-scan [.]
snappy::SnappyDecompressor::DecompressAllTags<snappy::SnappyArrayWriter>
6.19% orc-scan orc-scan [.] orc::BooleanRleDecoderImpl::next
5.54% orc-scan orc-scan [.] orc::RleDecoderV2::nextDirect
5.37% orc-scan orc-scan [.] orc::RleDecoderV2::nextShortRepeats
4.58% orc-scan orc-scan [.] snappy::(anonymous
namespace)::IncrementalCopy
3.66% orc-scan orc-scan [.] snappy::(anonymous
namespace)::UnalignedCopy64
3.43% orc-scan orc-scan [.] orc::RleDecoderV2::nextDelta
2.81% orc-scan orc-scan [.]
snappy::SnappyArrayWriter::AppendFromSelf
2.41% orc-scan orc-scan [.] orc::RleDecoderV2::next
2.22% orc-scan orc-scan [.] orc::RleDecoderV2::readByte
1.78% orc-scan orc-scan [.] snappy::(anonymous
namespace)::UnalignedCopy128
1.61% orc-scan libc-2.23.so [.] __memcpy_avx_unaligned
1.55% orc-scan orc-scan [.]
snappy::SnappyArrayWriter::TryFastAppend
1.44% orc-scan [kernel.kallsyms] [k] copy_user_enhanced_fast_string
1.30% orc-scan orc-scan [.] orc::ByteRleDecoderImpl::next
1.17% orc-scan orc-scan [.] orc::RleDecoderV2::unrolledUnpack24
1.09% orc-scan libc-2.23.so [.] __memset_avx2
1.02% orc-scan orc-scan [.] snappy::SnappyArrayWriter::Produced
0.93% orc-scan orc-scan [.] orc::RleDecoderV2::readLongs
0.81% orc-scan orc-scan [.] orc::RleDecoderV2::unrolledUnpack16
0.76% orc-scan orc-scan [.] orc::RleDecoderV2::unrolledUnpack32
0.64% orc-scan orc-scan [.] snappy::LittleEndian::Load32
0.37% orc-scan orc-scan [.] snappy::LittleEndian::ToHost32
0.27% orc-scan orc-scan [.] orc::ColumnReader::next
{code}
The time on BooleanRleDecoderImpl::next() is dominanted by this loop:
https://github.com/apache/orc/blob/78849c99359cfb9f5c9da88b8e16d118154ed9da/c%2B%2B/src/ByteRLE.cc#L627-L631
{code:cpp}
for(int64_t i=static_cast<int64_t>(numValues) - 1;
i >= static_cast<int64_t>(position); --i, --bitsLeft) {
uint64_t shiftPosn = (-bitsLeft) % 8;
data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
}
{code}
Disassembly of it:
{code:java}
orc::BooleanRleDecoderImpl::next
/home/quanlong/workspace/orc/build/tools/src/orc-scan
Percent│ pop %r14
...
8.67 │240:┌─→mov %r13,%rcx
8.47 │ │ sub $0x1,%r13
8.06 │ │ mov %r13,%rax
7.26 │ │ neg %rcx
8.06 │ │ shr $0x3,%rax
6.85 │ │ and $0x7,%ecx
5.85 │ │ movsbl (%rsi,%rax,1),%eax
19.96 │ │ sar %cl,%eax
8.06 │ │ and $0x1,%eax
7.66 │ │ mov %al,(%r15,%r13,1)
│ ├──cmp %rbx,%r13
9.68 │ └──jne 7d3240 <orc::BooleanRleDecoderImpl::next(char*, unsigned
long, 240
│ ↑ jmpq 7d316f <orc::BooleanRleDecoderImpl::next(char*, unsigned
long, 16f
│26b: mov %r14,%r13
│ xor %ebx,%ebx
│ ↑ jmpq 7d31c2 <orc::BooleanRleDecoderImpl::next(char*, unsigned
long, 1c2
{code}
We can unroll the loop to save some instructions. I have a WIP patch but I
haven't optimized it further.
> [C++] Unroll loops in BooleanRleDecoderImpl::next()
> ---------------------------------------------------
>
> Key: ORC-1142
> URL: https://issues.apache.org/jira/browse/ORC-1142
> Project: ORC
> Issue Type: Improvement
> Components: C++
> Reporter: Quanlong Huang
> Priority: Major
> Attachments: bool-rle-opt.patch
>
>
> BooleanRleDecoderImpl::next() takes some portion of the time when reading
> nullable columns. Here is the histogram of running orc-scan on a tpcds
> store_sales data file:
> {code:java}
> Samples: 7K of event 'cycles:ppp', Event count (approx.): 8731129738
> Overhead Command Shared Object Symbol
> 31.71% orc-scan orc-scan [.] orc::Decimal64ColumnReader::next
> 7.44% orc-scan orc-scan [.]
> orc::RleDecoderV2::copyDataFromBuffer
> 7.29% orc-scan orc-scan [.]
> snappy::SnappyDecompressor::DecompressAllTags<snappy::SnappyArrayWriter>
> 6.19% orc-scan orc-scan [.] orc::BooleanRleDecoderImpl::next
> 5.54% orc-scan orc-scan [.] orc::RleDecoderV2::nextDirect
> 5.37% orc-scan orc-scan [.]
> orc::RleDecoderV2::nextShortRepeats
> 4.58% orc-scan orc-scan [.] snappy::(anonymous
> namespace)::IncrementalCopy
> 3.66% orc-scan orc-scan [.] snappy::(anonymous
> namespace)::UnalignedCopy64
> 3.43% orc-scan orc-scan [.] orc::RleDecoderV2::nextDelta
> 2.81% orc-scan orc-scan [.]
> snappy::SnappyArrayWriter::AppendFromSelf
> 2.41% orc-scan orc-scan [.] orc::RleDecoderV2::next
> 2.22% orc-scan orc-scan [.] orc::RleDecoderV2::readByte
> 1.78% orc-scan orc-scan [.] snappy::(anonymous
> namespace)::UnalignedCopy128
> 1.61% orc-scan libc-2.23.so [.] __memcpy_avx_unaligned
> 1.55% orc-scan orc-scan [.]
> snappy::SnappyArrayWriter::TryFastAppend
> 1.44% orc-scan [kernel.kallsyms] [k] copy_user_enhanced_fast_string
> 1.30% orc-scan orc-scan [.] orc::ByteRleDecoderImpl::next
> 1.17% orc-scan orc-scan [.]
> orc::RleDecoderV2::unrolledUnpack24
> 1.09% orc-scan libc-2.23.so [.] __memset_avx2
> 1.02% orc-scan orc-scan [.]
> snappy::SnappyArrayWriter::Produced
> 0.93% orc-scan orc-scan [.] orc::RleDecoderV2::readLongs
> 0.81% orc-scan orc-scan [.]
> orc::RleDecoderV2::unrolledUnpack16
> 0.76% orc-scan orc-scan [.]
> orc::RleDecoderV2::unrolledUnpack32
> 0.64% orc-scan orc-scan [.] snappy::LittleEndian::Load32
> 0.37% orc-scan orc-scan [.] snappy::LittleEndian::ToHost32
> 0.27% orc-scan orc-scan [.] orc::ColumnReader::next
> {code}
> The time on BooleanRleDecoderImpl::next() is dominanted by this loop:
> [https://github.com/apache/orc/blob/78849c99359cfb9f5c9da88b8e16d118154ed9da/c%2B%2B/src/ByteRLE.cc#L627-L631]
> {code:cpp}
> for(int64_t i=static_cast<int64_t>(numValues) - 1;
> i >= static_cast<int64_t>(position); --i, --bitsLeft) {
> uint64_t shiftPosn = (-bitsLeft) % 8;
> data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
> }
> {code}
> Disassembly of it:
> {code:java}
> orc::BooleanRleDecoderImpl::next
> /home/quanlong/workspace/orc/build/tools/src/orc-scan
> Percent│ pop %r14
> ...
> 8.67 │240:┌─→mov %r13,%rcx
> 8.47 │ │ sub $0x1,%r13
> 8.06 │ │ mov %r13,%rax
> 7.26 │ │ neg %rcx
> 8.06 │ │ shr $0x3,%rax
> 6.85 │ │ and $0x7,%ecx
> 5.85 │ │ movsbl (%rsi,%rax,1),%eax
> 19.96 │ │ sar %cl,%eax
> 8.06 │ │ and $0x1,%eax
> 7.66 │ │ mov %al,(%r15,%r13,1)
> │ ├──cmp %rbx,%r13
> 9.68 │ └──jne 7d3240 <orc::BooleanRleDecoderImpl::next(char*,
> unsigned long, 240
> │ ↑ jmpq 7d316f <orc::BooleanRleDecoderImpl::next(char*,
> unsigned long, 16f
> │26b: mov %r14,%r13
> │ xor %ebx,%ebx
> │ ↑ jmpq 7d31c2 <orc::BooleanRleDecoderImpl::next(char*,
> unsigned long, 1c2
> {code}
> We can unroll the loop to save some instructions. I have a WIP patch
> ([^bool-rle-opt.patch]) but I haven't optimized it further.
--
This message was sent by Atlassian Jira
(v8.20.1#820001)