This is an automated email from the ASF dual-hosted git repository.
pgaref pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/orc.git
The following commit(s) were added to refs/heads/main by this push:
new fb0ed31 ORC-829: Optimize percentileBits (#734)
fb0ed31 is described below
commit fb0ed31f15d9b77b76bc5dec8721f5d9ee129879
Author: belugabehr <[email protected]>
AuthorDate: Wed Jul 7 11:32:16 2021 -0400
ORC-829: Optimize percentileBits (#734)
### What changes were proposed in this pull request?
Optimize Serialization percentileBits method
### Why are the changes needed?
Speed and simplicity
### How was this patch tested?
Existing unit tests, no functionality changes.
---
.../org/apache/orc/impl/SerializationUtils.java | 87 +++++++++++++---------
1 file changed, 51 insertions(+), 36 deletions(-)
diff --git a/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
b/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
index 522e20c..460fdc2 100644
--- a/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
+++ b/java/core/src/java/org/apache/orc/impl/SerializationUtils.java
@@ -34,6 +34,7 @@ import java.io.OutputStream;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.sql.Date;
+import java.util.Arrays;
import java.util.TimeZone;
public final class SerializationUtils {
@@ -43,6 +44,15 @@ public final class SerializationUtils {
private final byte[] readBuffer;
private final byte[] writeBuffer;
+ /**
+ * Buffer for histogram that store the encoded bit requirement for each bit
+ * size. Maximum number of discrete bits that can encoded is ordinal of
+ * FixedBitSizes.
+ *
+ * @see FixedBitSizes
+ */
+ private final int[] histBuffer = new int[32];
+
public SerializationUtils() {
this.readBuffer = new byte[BUFFER_SIZE];
this.writeBuffer = new byte[BUFFER_SIZE];
@@ -255,12 +265,8 @@ public final class SerializationUtils {
* @return bits required to store value
*/
public int findClosestNumBits(long value) {
- int count = 0;
- while (value != 0) {
- count++;
- value = value >>> 1;
- }
- return getClosestFixedBits(count);
+ final int numBits = 64 - Long.numberOfLeadingZeros(value);
+ return getClosestFixedBits(numBits);
}
/**
@@ -287,27 +293,24 @@ public final class SerializationUtils {
* @param p - percentile value (>=0.0 to <=1.0)
* @return pth percentile bits
*/
- public int percentileBits(long[] data, int offset, int length,
- double p) {
+ public int percentileBits(long[] data, int offset, int length, double p) {
if ((p > 1.0) || (p <= 0.0)) {
return -1;
}
- // histogram that store the encoded bit requirement for each values.
- // maximum number of bits that can encoded is 32 (refer FixedBitSizes)
- int[] hist = new int[32];
+ Arrays.fill(this.histBuffer, 0);
// compute the histogram
for(int i = offset; i < (offset + length); i++) {
int idx = encodeBitWidth(findClosestNumBits(data[i]));
- hist[idx] += 1;
+ this.histBuffer[idx] += 1;
}
int perLen = (int) (length * (1.0 - p));
// return the bits required by pth percentile length
- for(int i = hist.length - 1; i >= 0; i--) {
- perLen -= hist[i];
+ for(int i = this.histBuffer.length - 1; i >= 0; i--) {
+ perLen -= this.histBuffer[i];
if (perLen < 0) {
return decodeBitWidth(i);
}
@@ -352,26 +355,31 @@ public final class SerializationUtils {
if (n == 0) {
return 1;
}
-
- if (n >= 1 && n <= 24) {
+ if (n <= 24) {
return n;
- } else if (n > 24 && n <= 26) {
+ }
+ if (n <= 26) {
return 26;
- } else if (n > 26 && n <= 28) {
+ }
+ if (n <= 28) {
return 28;
- } else if (n > 28 && n <= 30) {
+ }
+ if (n <= 30) {
return 30;
- } else if (n > 30 && n <= 32) {
+ }
+ if (n <= 32) {
return 32;
- } else if (n > 32 && n <= 40) {
+ }
+ if (n <= 40) {
return 40;
- } else if (n > 40 && n <= 48) {
+ }
+ if (n <= 48) {
return 48;
- } else if (n > 48 && n <= 56) {
+ }
+ if (n <= 56) {
return 56;
- } else {
- return 64;
}
+ return 64;
}
public int getClosestAlignedFixedBits(int n) {
@@ -402,8 +410,9 @@ public final class SerializationUtils {
/**
* Finds the closest available fixed bit width match and returns its encoded
- * value (ordinal)
- * @param n - fixed bit width to encode
+ * value (ordinal).
+ *
+ * @param n fixed bit width to encode
* @return encoded fixed bit width
*/
public int encodeBitWidth(int n) {
@@ -411,23 +420,29 @@ public final class SerializationUtils {
if (n >= 1 && n <= 24) {
return n - 1;
- } else if (n > 24 && n <= 26) {
+ }
+ if (n <= 26) {
return FixedBitSizes.TWENTYSIX.ordinal();
- } else if (n > 26 && n <= 28) {
+ }
+ if (n <= 28) {
return FixedBitSizes.TWENTYEIGHT.ordinal();
- } else if (n > 28 && n <= 30) {
+ }
+ if (n <= 30) {
return FixedBitSizes.THIRTY.ordinal();
- } else if (n > 30 && n <= 32) {
+ }
+ if (n <= 32) {
return FixedBitSizes.THIRTYTWO.ordinal();
- } else if (n > 32 && n <= 40) {
+ }
+ if (n <= 40) {
return FixedBitSizes.FORTY.ordinal();
- } else if (n > 40 && n <= 48) {
+ }
+ if (n <= 48) {
return FixedBitSizes.FORTYEIGHT.ordinal();
- } else if (n > 48 && n <= 56) {
+ }
+ if (n <= 56) {
return FixedBitSizes.FIFTYSIX.ordinal();
- } else {
- return FixedBitSizes.SIXTYFOUR.ordinal();
}
+ return FixedBitSizes.SIXTYFOUR.ordinal();
}
/**