This is an automated email from the ASF dual-hosted git repository.
morningman pushed a commit to branch orc
in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/orc by this push:
new c25d17ab [Optimize] Optimize the performance of reading decimals. (#111)
c25d17ab is described below
commit c25d17ab509a1289ed6276f6349a4992dbc968c1
Author: Qi Chen <[email protected]>
AuthorDate: Mon Aug 21 22:34:58 2023 +0800
[Optimize] Optimize the performance of reading decimals. (#111)
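Optimize the performance of reading decimals by splitting value decoding (readInt64/readInt128) from scale adjustment (scaleInt64/scaleInt128). The same query is timed twice below: 1.88 sec, then 1.36 sec.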
MySQL [hive.tpch100_orc]> select count(l_extendedprice) from lineitem;
+------------------------+
| count(l_extendedprice) |
+------------------------+
| 600037902 |
+------------------------+
1 row in set (1.88 sec)
MySQL [hive.tpch100_orc]> select count(l_extendedprice) from lineitem;
+------------------------+
| count(l_extendedprice) |
+------------------------+
| 600037902 |
+------------------------+
1 row in set (1.36 sec)
---
c++/src/ColumnReader.cc | 57 +++++++++++++++++++++++++++++++++----------------
1 file changed, 39 insertions(+), 18 deletions(-)
diff --git a/c++/src/ColumnReader.cc b/c++/src/ColumnReader.cc
index 78005abc..525f62c4 100644
--- a/c++/src/ColumnReader.cc
+++ b/c++/src/ColumnReader.cc
@@ -1666,7 +1666,7 @@ namespace orc {
}
}
- void readInt64(int64_t& value, int32_t currentScale) {
+ void readInt64(int64_t& value) {
value = 0;
size_t offset = 0;
while (true) {
@@ -1679,6 +1679,9 @@ namespace orc {
}
}
value = unZigZag(static_cast<uint64_t>(value));
+ }
+
+ void scaleInt64(int64_t& value, int32_t currentScale) {
if (scale > currentScale && static_cast<uint64_t>(scale - currentScale)
<= MAX_PRECISION_64) {
value *= POWERS_OF_TEN[scale - currentScale];
} else if (scale < currentScale &&
@@ -1791,12 +1794,14 @@ namespace orc {
if (notNull) {
for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
+ readInt64(values[i]);
+ scaleInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
} else {
for (size_t i = 0; i < numValues; ++i) {
- readInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
+ readInt64(values[i]);
+ scaleInt64(values[i], static_cast<int32_t>(scaleBuffer[i]));
}
}
}
@@ -1822,8 +1827,8 @@ namespace orc {
skipInternal(countNonNullRowsInRange(notNull, previousIdx, idx),
readPhase);
}
if (notNull[idx]) {
- readInt64(values[idx], static_cast<int32_t>(scaleBuffer[idx]));
- ;
+ readInt64(values[idx]);
+ scaleInt64(values[idx], static_cast<int32_t>(scaleBuffer[idx]));
}
previousIdx = idx + 1;
}
@@ -1834,7 +1839,8 @@ namespace orc {
if (idx - previousIdx > 0) {
skipInternal(idx - previousIdx, readPhase);
}
- readInt64(values[idx], static_cast<int32_t>(scaleBuffer[idx]));
+ readInt64(values[idx]);
+ scaleInt64(values[idx], static_cast<int32_t>(scaleBuffer[idx]));
previousIdx = idx + 1;
}
skipInternal(numValues - previousIdx, readPhase);
@@ -1879,7 +1885,7 @@ namespace orc {
const ReadPhase& readPhase, uint16_t* sel_rowid_idx, size_t
sel_size) override;
private:
- void readInt128(Int128& value, int32_t currentScale) {
+ void readInt128(Int128& value) {
value = 0;
Int128 work;
uint32_t offset = 0;
@@ -1895,7 +1901,6 @@ namespace orc {
}
}
unZigZagInt128(value);
- scaleInt128(value, static_cast<uint32_t>(scale),
static_cast<uint32_t>(currentScale));
}
void nextInternal(ColumnVectorBatch& rowBatch, uint64_t numValues, char*
notNull,
@@ -1939,12 +1944,15 @@ namespace orc {
if (notNull) {
for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
+ readInt128(values[i]);
+ scaleInt128(values[i], static_cast<uint32_t>(scale),
+ static_cast<int32_t>(scaleBuffer[i]));
}
}
} else {
for (size_t i = 0; i < numValues; ++i) {
- readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]));
+ readInt128(values[i]);
+ scaleInt128(values[i], static_cast<uint32_t>(scale),
static_cast<int32_t>(scaleBuffer[i]));
}
}
}
@@ -1971,7 +1979,9 @@ namespace orc {
skipInternal(countNonNullRowsInRange(notNull, previousIdx, idx),
readPhase);
}
if (notNull[idx]) {
- readInt128(values[idx], static_cast<int32_t>(scaleBuffer[idx]));
+ readInt128(values[idx]);
+ scaleInt128(values[idx], static_cast<uint32_t>(scale),
+ static_cast<int32_t>(scaleBuffer[idx]));
}
previousIdx = idx + 1;
}
@@ -1982,7 +1992,7 @@ namespace orc {
if (idx - previousIdx > 0) {
skipInternal(idx - previousIdx, readPhase);
}
- readInt128(values[idx], static_cast<int32_t>(scaleBuffer[idx]));
+ readInt128(values[idx]);
previousIdx = idx + 1;
}
skipInternal(numValues - previousIdx, readPhase);
@@ -2048,7 +2058,7 @@ namespace orc {
/**
* Read an Int128 from the stream and correct it to the desired scale.
*/
- bool readInt128(Int128& value, int32_t currentScale) {
+ bool readInt128(Int128& value) {
// -/+ 99999999999999999999999999999999999999
static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001);
static const Int128 MAX_VALUE(0x4b3b4ca85a86c47a, 0x098a223fffffffff);
@@ -2078,7 +2088,6 @@ namespace orc {
return result;
}
unZigZagInt128(value);
- scaleInt128(value, static_cast<uint32_t>(scale),
static_cast<uint32_t>(currentScale));
return value >= MIN_VALUE && value <= MAX_VALUE;
}
@@ -2134,7 +2143,7 @@ namespace orc {
if (notNull) {
for (size_t i = 0; i < numValues; ++i) {
if (notNull[i]) {
- if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) {
+ if (!readInt128(values[i])) {
if (throwOnOverflow) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
@@ -2143,12 +2152,15 @@ namespace orc {
<< "replaced by NULL.\n";
notNull[i] = false;
}
+ } else {
+ scaleInt128(values[i], static_cast<uint32_t>(scale),
+ static_cast<int32_t>(scaleBuffer[i]));
}
}
}
} else {
for (size_t i = 0; i < numValues; ++i) {
- if (!readInt128(values[i], static_cast<int32_t>(scaleBuffer[i]))) {
+ if (!readInt128(values[i])) {
if (throwOnOverflow) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
@@ -2158,6 +2170,9 @@ namespace orc {
batch.hasNulls = true;
batch.notNull[i] = false;
}
+ } else {
+ scaleInt128(values[i], static_cast<uint32_t>(scale),
+ static_cast<int32_t>(scaleBuffer[i]));
}
}
}
@@ -2187,7 +2202,7 @@ namespace orc {
skipInternal(countNonNullRowsInRange(notNull, previousIdx, idx),
readPhase);
}
if (notNull[idx]) {
- if (!readInt128(values[idx],
static_cast<int32_t>(scaleBuffer[idx]))) {
+ if (!readInt128(values[idx])) {
if (throwOnOverflow) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
@@ -2196,6 +2211,9 @@ namespace orc {
<< "replaced by NULL.\n";
notNull[idx] = false;
}
+ } else {
+ scaleInt128(values[idx], static_cast<uint32_t>(scale),
+ static_cast<int32_t>(scaleBuffer[idx]));
}
}
previousIdx = idx + 1;
@@ -2207,7 +2225,7 @@ namespace orc {
if (idx - previousIdx > 0) {
skipInternal(idx - previousIdx, readPhase);
}
- if (!readInt128(values[idx], static_cast<int32_t>(scaleBuffer[idx]))) {
+ if (!readInt128(values[idx])) {
if (throwOnOverflow) {
throw ParseError("Hive 0.11 decimal was more than 38 digits.");
} else {
@@ -2217,6 +2235,9 @@ namespace orc {
batch.hasNulls = true;
batch.notNull[idx] = false;
}
+ } else {
+ scaleInt128(values[idx], static_cast<uint32_t>(scale),
+ static_cast<int32_t>(scaleBuffer[idx]));
}
previousIdx = idx + 1;
}
---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]