This is an automated email from the ASF dual-hosted git repository.
dongjoon pushed a commit to branch branch-1.7
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/branch-1.7 by this push:
new 42d11e9 ORC-1041: Use `memcpy` during LZO decompression (#958)
42d11e9 is described below
commit 42d11e9b8de906db5450ff4142fc53f7fc73f3b4
Author: Yiqun Zhang <[email protected]>
AuthorDate: Thu Nov 4 23:50:22 2021 +0800
ORC-1041: Use `memcpy` during LZO decompression (#958)
### What changes were proposed in this pull request?
This pr is aimed to fix the implementation of copy data blocks during LZO
decompression.
`*reinterpret_cast< int64_t*>(output) = *reinterpret_cast<
int64_t*>(matchAddress);` can lead to unexpected behavior, and in failed test
cases it does not appear to be an atomic operation.
This pr uses memcpy instead of the above statement.
Here are the performance benchmarks, where `memcpy` is basically the same
as `reinterpret_cast` + `assignment`. The newer compilers outperform unfolded
assignment, so here is a screenshot of the results with only some of the
parameters, which you can reproduce with the following test code at
https://quick-bench.com/



```c++
#include <string.h>
static void use_memcpy(benchmark::State& state) {
auto size = state.range(0);
char buf[size];
for (int i = 0; i < 8; ++i) {
buf[i] = 'a';
}
for (auto _ : state) {
char *output = buf + 8;
char *matchAddress = buf;
char *matchOutputLimit = buf + size;
while (output < matchOutputLimit) {
memcpy(output, matchAddress, 8);
matchAddress += 8;
output += 8;
}
}
}
static void use_expanded_assignment(benchmark::State& state) {
auto size = state.range(0);
char buf[size];
for (int i = 0; i < 8; ++i) {
buf[i] = 'a';
}
for (auto _ : state) {
char *output = buf + 8;
char *matchAddress = buf;
char *matchOutputLimit = buf + size;
while (output < matchOutputLimit) {
output[0] = *matchOutputLimit;
output[1] = *(matchOutputLimit + 1);
output[2] = *(matchOutputLimit + 2);
output[3] = *(matchOutputLimit + 3);
output[4] = *(matchOutputLimit + 4);
output[5] = *(matchOutputLimit + 5);
output[6] = *(matchOutputLimit + 6);
output[7] = *(matchOutputLimit + 7);
matchAddress += 8;
output += 8;
}
}
}
static void use_reinterpret_assignment(benchmark::State& state) {
auto size = state.range(0);
char buf[size];
for (int i = 0; i < 8; ++i) {
buf[i] = 'a';
}
for (auto _ : state) {
char *output = buf + 8;
char *matchAddress = buf;
char *matchOutputLimit = buf + size;
while (output < matchOutputLimit) {
*reinterpret_cast<int64_t*>(output) =
*reinterpret_cast<int64_t*>(matchAddress);
matchAddress += 8;
output += 8;
}
}
}
BENCHMARK(use_memcpy)->Arg(100000);
BENCHMARK(use_expanded_assignment)->Arg(100000);
BENCHMARK(use_reinterpret_assignment)->Arg(100000);
```
### Why are the changes needed?
Fix the bug of LZO decompression.
### How was this patch tested?
Pass the CIs.
(cherry picked from commit 502661aee2b1895ba4ff652297bf3a5f4ea8955a)
Signed-off-by: Dongjoon Hyun <[email protected]>
---
c++/src/LzoDecompressor.cc | 15 +++++----------
1 file changed, 5 insertions(+), 10 deletions(-)
diff --git a/c++/src/LzoDecompressor.cc b/c++/src/LzoDecompressor.cc
index d1ba183..21bf194 100644
--- a/c++/src/LzoDecompressor.cc
+++ b/c++/src/LzoDecompressor.cc
@@ -312,13 +312,11 @@ namespace orc {
output += SIZE_OF_INT;
matchAddress += increment32;
- *reinterpret_cast<int32_t*>(output) =
- *reinterpret_cast<int32_t*>(matchAddress);
+ memcpy(output, matchAddress, SIZE_OF_INT);
output += SIZE_OF_INT;
matchAddress -= decrement64;
} else {
- *reinterpret_cast<int64_t*>(output) =
- *reinterpret_cast<int64_t*>(matchAddress);
+ memcpy(output, matchAddress, SIZE_OF_LONG);
matchAddress += SIZE_OF_LONG;
output += SIZE_OF_LONG;
}
@@ -329,8 +327,7 @@ namespace orc {
}
while (output < fastOutputLimit) {
- *reinterpret_cast<int64_t*>(output) =
- *reinterpret_cast<int64_t*>(matchAddress);
+ memcpy(output, matchAddress, SIZE_OF_LONG);
matchAddress += SIZE_OF_LONG;
output += SIZE_OF_LONG;
}
@@ -340,8 +337,7 @@ namespace orc {
}
} else {
while (output < matchOutputLimit) {
- *reinterpret_cast<int64_t*>(output) =
- *reinterpret_cast<int64_t*>(matchAddress);
+ memcpy(output, matchAddress, SIZE_OF_LONG);
matchAddress += SIZE_OF_LONG;
output += SIZE_OF_LONG;
}
@@ -366,8 +362,7 @@ namespace orc {
// fast copy. We may over-copy but there's enough room in input
// and output to not overrun them
do {
- *reinterpret_cast<int64_t*>(output) =
- *reinterpret_cast<const int64_t*>(input);
+ memcpy(output, input, SIZE_OF_LONG);
input += SIZE_OF_LONG;
output += SIZE_OF_LONG;
} while (output < literalOutputLimit);