Repository: parquet-mr Updated Branches: refs/heads/parquet-1.8.x 4297134dc -> d41fdf3ef
PARQUET-852: Slowly ramp up sizes of byte[] in ByteBasedBitPackingEncoder https://issues.apache.org/jira/browse/PARQUET-852 Author: John Jenkins <[email protected]> Closes #401 from JohnPJenkins/PARQUET-852 and squashes the following commits: 334acec [John Jenkins] PARQUET-852: Slowly ramp up sizes of byte[] in ByteBasedBitPackingEncoder Project: http://git-wip-us.apache.org/repos/asf/parquet-mr/repo Commit: http://git-wip-us.apache.org/repos/asf/parquet-mr/commit/d41fdf3e Tree: http://git-wip-us.apache.org/repos/asf/parquet-mr/tree/d41fdf3e Diff: http://git-wip-us.apache.org/repos/asf/parquet-mr/diff/d41fdf3e Branch: refs/heads/parquet-1.8.x Commit: d41fdf3efbe53dc266e0c27b0ee7ded60d5c3b81 Parents: 4297134 Author: John Jenkins <[email protected]> Authored: Fri May 12 15:09:56 2017 -0700 Committer: Julien Le Dem <[email protected]> Committed: Fri May 12 15:13:11 2017 -0700 ---------------------------------------------------------------------- .../bitpacking/ByteBasedBitPackingEncoder.java | 30 ++++++++++++++------ .../TestByteBasedBitPackingEncoder.java | 18 ++++++++---- 2 files changed, 34 insertions(+), 14 deletions(-) ---------------------------------------------------------------------- http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/d41fdf3e/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java ---------------------------------------------------------------------- diff --git a/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java b/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java index cc23e8f..0bc8b30 100644 --- a/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java +++ b/parquet-encoding/src/main/java/org/apache/parquet/column/values/bitpacking/ByteBasedBitPackingEncoder.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -39,11 +39,14 @@ public class ByteBasedBitPackingEncoder { private static final Logger LOG = LoggerFactory.getLogger(ByteBasedBitPackingEncoder.class); private static final int VALUES_WRITTEN_AT_A_TIME = 8; + private static final int MAX_SLAB_SIZE_MULT = 64 * 1024; + private static final int INITIAL_SLAB_SIZE_MULT = 1024; private final int bitWidth; private final BytePacker packer; private final int[] input = new int[VALUES_WRITTEN_AT_A_TIME]; - private final int slabSize; + private int slabSize; + private long totalFullSlabSize; private int inputSize; private byte[] packed; private int packedPosition; @@ -56,8 +59,9 @@ public class ByteBasedBitPackingEncoder { public ByteBasedBitPackingEncoder(int bitWidth, Packer packer) { this.bitWidth = bitWidth; this.inputSize = 0; + this.totalFullSlabSize = 0; // must be a multiple of bitWidth - this.slabSize = bitWidth * 64 * 1024; + this.slabSize = (bitWidth == 0) ? 1 : (bitWidth * INITIAL_SLAB_SIZE_MULT); initPackedSlab(); this.packer = packer.newBytePacker(bitWidth); } @@ -75,6 +79,10 @@ public class ByteBasedBitPackingEncoder { pack(); if (packedPosition == slabSize) { slabs.add(BytesInput.from(packed)); + totalFullSlabSize += slabSize; + if (slabSize < bitWidth * MAX_SLAB_SIZE_MULT) { + slabSize *= 2; + } initPackedSlab(); } } @@ -99,7 +107,7 @@ public class ByteBasedBitPackingEncoder { public BytesInput toBytes() throws IOException { int packedByteLength = packedPosition + BytesUtils.paddedByteCountFromBits(inputSize * bitWidth); - LOG.debug("writing {} bytes", (slabs.size() * slabSize + packedByteLength)); + LOG.debug("writing {} bytes", (totalFullSlabSize + packedByteLength)); if (inputSize > 0) { for (int i = inputSize; i < input.length; i++) { input[i] = 0; @@ -113,18 +121,24 @@ public class ByteBasedBitPackingEncoder { * @return size of the data as it would be written */ public long getBufferSize() { - return BytesUtils.paddedByteCountFromBits(totalValues * bitWidth); + return BytesUtils.paddedByteCountFromBits((totalValues + inputSize) * bitWidth); } /** * @return total memory allocated */ public long getAllocatedSize() { - return (slabs.size() * slabSize) + packed.length + input.length * 4; + return totalFullSlabSize + packed.length + input.length * 4; } public String memUsageString(String prefix) { return String.format("%s ByteBitPacking %d slabs, %d bytes", prefix, slabs.size(), getAllocatedSize()); } + /** + * @return number of full slabs along with the current slab (debug aid) + */ + int getNumSlabs() { + return slabs.size() + 1; + } } http://git-wip-us.apache.org/repos/asf/parquet-mr/blob/d41fdf3e/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java ---------------------------------------------------------------------- diff --git a/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java b/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java index 293b961..b49595b 100644 --- a/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java +++ b/parquet-encoding/src/test/java/org/apache/parquet/column/values/bitpacking/TestByteBasedBitPackingEncoder.java @@ -1,4 +1,4 @@ -/* +/* * Licensed to the Apache Software Foundation (ASF) under one * or more contributor license agreements. See the NOTICE file * distributed with this work for additional information @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -18,22 +18,28 @@ */ package org.apache.parquet.column.values.bitpacking; +import org.apache.parquet.bytes.BytesUtils; import org.junit.Test; +import static org.junit.Assert.assertEquals; + public class TestByteBasedBitPackingEncoder { @Test public void testSlabBoundary() { - for (int i = 0; i < 32; i++) { + for (int i = 0; i <= 32; i++) { final ByteBasedBitPackingEncoder encoder = new ByteBasedBitPackingEncoder(i, Packer.BIG_ENDIAN); - // make sure to write more than a slab - for (int j = 0; j < 64 * 1024 * 32 + 10; j++) { + // make sure to write through the progression of slabs + final int totalValues = 191 * 1024 * 8 + 10; + for (int j = 0; j < totalValues; j++) { try { encoder.writeInt(j); } catch (Exception e) { throw new RuntimeException(i + ": error writing " + j, e); } } + assertEquals(BytesUtils.paddedByteCountFromBits(totalValues * i), encoder.getBufferSize()); + assertEquals(i == 0 ? 1 : 9, encoder.getNumSlabs()); } }
