[
https://issues.apache.org/jira/browse/PARQUET-852?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16449832#comment-16449832
]
ASF GitHub Bot commented on PARQUET-852:
----------------------------------------
zivanfi closed pull request #467: Revert "PARQUET-852: Slowly ramp up sizes of
byte[] in ByteBasedBitPackingEncoder"
URL: https://github.com/apache/parquet-mr/pull/467
This is a PR merged from a forked repository.
As GitHub hides the original diff on merge, it is displayed below for
the sake of provenance:
As this is a foreign pull request (from a fork), the diff is supplied
below (as it won't show otherwise due to GitHub magic):
diff --git
a/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java
b/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java
index 0bc8b3023..cc23e8f87 100644
---
a/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java
+++
b/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -39,14 +39,11 @@
private static final Logger LOG =
LoggerFactory.getLogger(ByteBasedBitPackingEncoder.class);
private static final int VALUES_WRITTEN_AT_A_TIME = 8;
- private static final int MAX_SLAB_SIZE_MULT = 64 * 1024;
- private static final int INITIAL_SLAB_SIZE_MULT = 1024;
private final int bitWidth;
private final BytePacker packer;
private final int[] input = new int[VALUES_WRITTEN_AT_A_TIME];
- private int slabSize;
- private long totalFullSlabSize;
+ private final int slabSize;
private int inputSize;
private byte[] packed;
private int packedPosition;
@@ -59,9 +56,8 @@
public ByteBasedBitPackingEncoder(int bitWidth, Packer packer) {
this.bitWidth = bitWidth;
this.inputSize = 0;
- this.totalFullSlabSize = 0;
// must be a multiple of bitWidth
- this.slabSize = (bitWidth == 0) ? 1 : (bitWidth * INITIAL_SLAB_SIZE_MULT);
+ this.slabSize = bitWidth * 64 * 1024;
initPackedSlab();
this.packer = packer.newBytePacker(bitWidth);
}
@@ -79,10 +75,6 @@ public void writeInt(int value) throws IOException {
pack();
if (packedPosition == slabSize) {
slabs.add(BytesInput.from(packed));
- totalFullSlabSize += slabSize;
- if (slabSize < bitWidth * MAX_SLAB_SIZE_MULT) {
- slabSize *= 2;
- }
initPackedSlab();
}
}
@@ -107,7 +99,7 @@ private void initPackedSlab() {
public BytesInput toBytes() throws IOException {
int packedByteLength = packedPosition +
BytesUtils.paddedByteCountFromBits(inputSize * bitWidth);
- LOG.debug("writing {} bytes", (totalFullSlabSize + packedByteLength));
+ LOG.debug("writing {} bytes", (slabs.size() * slabSize +
packedByteLength));
if (inputSize > 0) {
for (int i = inputSize; i < input.length; i++) {
input[i] = 0;
@@ -121,24 +113,18 @@ public BytesInput toBytes() throws IOException {
* @return size of the data as it would be written
*/
public long getBufferSize() {
- return BytesUtils.paddedByteCountFromBits((totalValues + inputSize) *
bitWidth);
+ return BytesUtils.paddedByteCountFromBits(totalValues * bitWidth);
}
/**
* @return total memory allocated
*/
public long getAllocatedSize() {
- return totalFullSlabSize + packed.length + input.length * 4;
+ return (slabs.size() * slabSize) + packed.length + input.length * 4;
}
public String memUsageString(String prefix) {
return String.format("%s ByteBitPacking %d slabs, %d bytes", prefix,
slabs.size(), getAllocatedSize());
}
- /**
- * @return number of full slabs along with the current slab (debug aid)
- */
- int getNumSlabs() {
- return slabs.size() + 1;
- }
}
diff --git
a/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java
b/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java
index b49595b43..293b961f0 100644
---
a/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java
+++
b/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java
@@ -1,4 +1,4 @@
-/*
+/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
@@ -6,9 +6,9 @@
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
- *
+ *
* http://www.apache.org/licenses/LICENSE-2.0
- *
+ *
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
@@ -18,28 +18,22 @@
*/
package org.apache.parquet.column.values.bitpacking;
-import org.apache.parquet.bytes.BytesUtils;
import org.junit.Test;
-import static org.junit.Assert.assertEquals;
-
public class TestByteBasedBitPackingEncoder {
@Test
public void testSlabBoundary() {
- for (int i = 0; i <= 32; i++) {
+ for (int i = 0; i < 32; i++) {
final ByteBasedBitPackingEncoder encoder = new
ByteBasedBitPackingEncoder(i, Packer.BIG_ENDIAN);
- // make sure to write through the progression of slabs
- final int totalValues = 191 * 1024 * 8 + 10;
- for (int j = 0; j < totalValues; j++) {
+ // make sure to write more than a slab
+ for (int j = 0; j < 64 * 1024 * 32 + 10; j++) {
try {
encoder.writeInt(j);
} catch (Exception e) {
throw new RuntimeException(i + ": error writing " + j, e);
}
}
- assertEquals(BytesUtils.paddedByteCountFromBits(totalValues * i),
encoder.getBufferSize());
- assertEquals(i == 0 ? 1 : 9, encoder.getNumSlabs());
}
}
----------------------------------------------------------------
This is an automated message from the Apache Git Service.
To respond to the message, please log on GitHub and use the
URL above to go to the specific comment.
For queries about this service, please contact Infrastructure at:
[email protected]
> Slowly ramp up sizes of byte[] in ByteBasedBitPackingEncoder
> ------------------------------------------------------------
>
> Key: PARQUET-852
> URL: https://issues.apache.org/jira/browse/PARQUET-852
> Project: Parquet
> Issue Type: Improvement
> Reporter: John Jenkins
> Priority: Minor
> Fix For: 1.10.0
>
>
> The current allocation policy for ByteBasedBitPackingEncoder is to allocate
> 64KB * #bits up-front. As similarly observed in [PARQUET-580], this can lead
> to significant memory overheads for high-fanout scenarios (many columns
> and/or open files, in my case using BooleanPlainValuesWriter).
> As done in [PARQUET-585], I'll follow up with a PR that starts with a smaller
> buffer and works its way up to a max.
--
This message was sent by Atlassian JIRA
(v7.6.3#76005)