[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386914#comment-16386914 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia closed pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646 This is a PR merged from a forked repository. As GitHub hides the original diff on merge, it is displayed below for the sake of provenance: As this is a foreign pull request (from a fork), the diff is supplied below (as it won't show otherwise due to GitHub magic): diff --git a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java index 5411baf7b..2f70f7372 100644 --- a/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java +++ b/java/memory/src/main/java/org/apache/arrow/memory/BaseAllocator.java @@ -134,6 +134,9 @@ private static String createErrorMsg(final BufferAllocator allocator, final int * @return The closest power of two of that value. */ public static int nextPowerOfTwo(int val) { +if (val == 0 || val == 1) { + return val + 1; +} int highestBit = Integer.highestOneBit(val); if (highestBit == val) { return val; @@ -149,6 +152,9 @@ public static int nextPowerOfTwo(int val) { * @return The closest power of two of that value. */ public static long nextPowerOfTwo(long val) { +if (val == 0 || val == 1) { + return val + 1; +} long highestBit = Long.highestOneBit(val); if (highestBit == val) { return val; diff --git a/java/vector/src/main/codegen/templates/UnionVector.java b/java/vector/src/main/codegen/templates/UnionVector.java index 84450bee5..1cfa0666a 100644 --- a/java/vector/src/main/codegen/templates/UnionVector.java +++ b/java/vector/src/main/codegen/templates/UnionVector.java @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +assert newAllocationSize >= 1; if (newAllocationSize > BaseValueVector.MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Unable to expand the buffer"); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java index cbc56fe3d..4b47df8a4 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseFixedWidthVector.java @@ -444,6 +444,7 @@ private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean dataBuffer) long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +assert newAllocationSize >= 1; if (newAllocationSize > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Unable to expand the buffer"); diff --git a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java index c32d20f18..ecb3c780e 100644 --- a/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java +++ b/java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java @@ -174,14 +174,14 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = Math.max((long)(valueCount * density), 1L); + if (size > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } + valueAllocationSizeInBytes = (int) size; validityAllocationSizeInBytes = getValidityBufferSizeFromCount(valueCount); /* to track the end offset of last data element in vector, we need @@ -489,6 +489,7 @@ public void reallocDataBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +assert newAllocationSize >= 1; if (newAllocationSize > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Unable to expand the buffer"); @@ -541,6 +542,7 @@ private ArrowBuf reallocBufferHelper(ArrowBuf buffer, final boolean offsetBuffer long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowe
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386913#comment-16386913 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172358637 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are + * null (no lists) or empty lists. This helps in tightly controlling + * the memory we provision for inner data vector. + * + * Similar analogy is applicable for VarCharVector where the capacity + * of the data buffer can be controlled using density multiplier + * instead of default multiplier of 8 (default size of average + * varchar length). + * + * Also from container vectors, we propagate the density down + * the the inner vectors so that they can use it appropriately. + */ +public interface DensityAwareVector { Review comment: I agree. This is not the only interface implemented on its own (without subclassing ValueVector). We have NullableVectorDefinitionSetter that provides the method setIndexDefined(index) This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386900#comment-16386900 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172356587 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are + * null (no lists) or empty lists. This helps in tightly controlling + * the memory we provision for inner data vector. + * + * Similar analogy is applicable for VarCharVector where the capacity + * of the data buffer can be controlled using density multiplier + * instead of default multiplier of 8 (default size of average + * varchar length). + * + * Also from container vectors, we propagate the density down + * the the inner vectors so that they can use it appropriately. + */ +public interface DensityAwareVector { Review comment: Ok. My feedback is while the current implementation is simple, the interface doesn't feel very well designed - if the interface is called "DensityAwareVector", I would expect it to have "Vector" like behavior, rather than just a single function. I prefer well designed interfaces, but I am ok with address this later as I don't see this being a blocker for this PR. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386885#comment-16386885 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172354537 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are + * null (no lists) or empty lists. This helps in tightly controlling + * the memory we provision for inner data vector. + * + * Similar analogy is applicable for VarCharVector where the capacity + * of the data buffer can be controlled using density multiplier + * instead of default multiplier of 8 (default size of average + * varchar length). + * + * Also from container vectors, we propagate the density down + * the the inner vectors so that they can use it appropriately. + */ +public interface DensityAwareVector { Review comment: I would like to refrain from changing the vector hierarchy at this point for this small change. A standalone interface does the job. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386882#comment-16386882 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-370593461 @siddharthteotia WDYT on https://github.com/apache/arrow/pull/1646#discussion_r172282525 This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386875#comment-16386875 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172352552 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: Ok. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386856#comment-16386856 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-370588667 Can we get a closure on this one? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386713#comment-16386713 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172318466 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: Which is why saying average list size implies the right meaning. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386712#comment-16386712 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172318347 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: I think you are trying to generalize the meaning of density whereas the list vector could be nested too. We propagate density down the tree. So here we just talk about the immediate inner vector. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386694#comment-16386694 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172316594 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: I see. Thanks for the explanation. > Density is the average size of list per position in the List vector This is fine. > density value of 10 implies each position in the list vector has a list of 10 values. If I understand correctly, a density value of 10 can be either: * 10 sub list of 10 values * 1 sub list 100 values, 9 null sublists * ... As long as the average size of sub lists equals density. Is that correct? If so, can we make it clear in the doc? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386684#comment-16386684 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-370549664 @BryanCutler , @icexelloss , the latest commit addresses the comments w.r.t realloc and nextPowerOfTwo. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386647#comment-16386647 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172307303 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: Density is the average size of list per position in the List vector as mentioned in the doc. For your example, density is 1. I don't think it is a good idea to generalize the purpose or usage of density. The main purpose of density was to be conservative about the value capacity provisioned for inner vectors. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386634#comment-16386634 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172304362 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: What if I have a vector such that 1 out of 10 positions has a list of size 10 and remaining positions are null, what would the density be? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386632#comment-16386632 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172304189 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386620#comment-16386620 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172302307 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: The doc doesn't mix that. It clearly indicates the purpose of density: For example, a *density value of 10 implies each position in the list *vector has a list of 10 values. *A density value of 0.1 implies out of 10 positions in *the list vector, 1 position has a list of size 1 and *remaining positions are null (no lists) or empty lists. *This helps in tightly controlling the memory we provision *for inner data vector. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386614#comment-16386614 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172300509 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: Sounds like for a list of 10 values. These two have the same density == 1: * 10 sub lists of size 1 * 1 sub list of size 10, 9 sub list of null Is that correct understanding? The doc seems to mix the two cases so it's not very clear to me. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386605#comment-16386605 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172300509 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: Sounds like for a list of 10 values. These two have the same density == 1: * 10 sub lists of size 1 * 1 sub list of size 10, 9 sub list of null Is that correct understanding? The doc seems to fix the two cases so it's not very clear to me. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386601#comment-16386601 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r17229 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java ## @@ -811,14 +811,9 @@ public void testSetInitialCapacity() { assertEquals(512, vector.getValueCapacity()); assertEquals(8, vector.getDataVector().getValueCapacity()); - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); - } catch (IllegalArgumentException e) { -error = true; - } finally { -assertTrue(error); - } + vector.setInitialCapacity(5, 0.1); + vector.allocateNew(); + assertEquals(7, vector.getValueCapacity()); Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386594#comment-16386594 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172299493 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: It's just that this is really safeguarding a function that is returning incorrect results, and it's in quite a few places. Maybe this PR is not the right place to fix it, so how about I open another JIRA, put a quick patch and then you can bring that in here? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386592#comment-16386592 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172299311 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: valueCount * density is used for computing the value capacity of the inner vector. IF the List vector has a valuecount of 10, we use the density to compute the target value count for the inner vector and < 1 value of density helps in provisioning keeping NULL values into account. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386547#comment-16386547 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172291183 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java ## @@ -99,6 +99,17 @@ public void setInitialCapacity(int numRecords) { } } + @Override + public void setInitialCapacity(int valueCount, double density) { +for (final ValueVector vector : (Iterable) this) { + if (vector instanceof DensityAwareVector) { Review comment: Ok, SGTM. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386581#comment-16386581 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172297486 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: I think there are probably too many such checks `newAllocationSize = Math.max(newAllocationSize, 1);` across the code, safeguarding realloc is fine but the way it's currently implemented feels too scattered and error prone. (Some one could forget to add this check in some vectors in the future, for instance) This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386573#comment-16386573 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172295709 ## File path: java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java ## @@ -174,14 +174,14 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = Math.max((long)(valueCount * density), 1L); Review comment: Ok. I don't have strong opinion on the behavior for the case `valueCount * density < 1`, I guess what you are saying make sense. In that case, can we make that clear in the documentation (maybe at the interface level)? Also, this is consistent with `setInitialCapacity(valueCount)`? i.e Does `setInitialCapacity(0)` also adjust to 1 as well? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386569#comment-16386569 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172295366 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: I just don't feel strong need for it. safeguarding realloc seems perfectly fine to me. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386558#comment-16386558 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172293201 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: If we want to be safe, I think we can create `nextPowerOfTwoZeroSafe` to not affect the existing users of `nextPowerOfTwo`? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386549#comment-16386549 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172291035 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java ## @@ -811,14 +811,9 @@ public void testSetInitialCapacity() { assertEquals(512, vector.getValueCapacity()); assertEquals(8, vector.getDataVector().getValueCapacity()); - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); - } catch (IllegalArgumentException e) { -error = true; - } finally { -assertTrue(error); - } + vector.setInitialCapacity(5, 0.1); + vector.allocateNew(); + assertEquals(7, vector.getValueCapacity()); Review comment: how about adding `assertEquals(1, vector.getDataVector().getValueCapacity())` and also maybe a brief explanation of the values? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386532#comment-16386532 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172289379 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: Should that function be fixed as part of this patch? I am not even sure what are the implications of that in other parts of Arrow and/or downstream consumers. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386515#comment-16386515 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172286775 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java ## @@ -99,6 +99,17 @@ public void setInitialCapacity(int numRecords) { } } + @Override + public void setInitialCapacity(int valueCount, double density) { +for (final ValueVector vector : (Iterable) this) { + if (vector instanceof DensityAwareVector) { Review comment: I agree, I think it's best to just check the class instance where needed instead of delegating This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386511#comment-16386511 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172286159 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: I agree with Bryan. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386504#comment-16386504 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172285488 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: Yeah I think it makes sense that density of 0.1 means 10% of the value is non-null. What I am not sure about is why the size of the non-null value has size 1 ? It seems `valuecount * density` is used for both (1) number of non-null sub lists in the parent list (2) (average) length of the non-null sub lists. What if these two values are different? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386492#comment-16386492 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172282743 ## File path: java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java ## @@ -174,14 +174,14 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = Math.max((long)(valueCount * density), 1L); Review comment: Because it is better to internally set the initial capacity to 1 as opposed to throwing exception. In our code, we invoke this in a loop dynamically adjusting the density value and stepping down the initial capacity because we are working with fix memory reservation and limits. So setInitialCapacity() followed by allocateNew() might fail in the second step if not enough memory. So we restart by adjusting the density and stepping down the value count. Throwing an exception doesn't help in any case. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386499#comment-16386499 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r17228 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: Well, technically given 0 the next power of 2 should be 1. So I think that function needs the fix and then the extra check here should be safe to remove. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386495#comment-16386495 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-370516903 @BryanCutler , @icexelloss , I have addressed and responded to review comments. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386494#comment-16386494 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172283361 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: 10 * 0.1 is the initial capacity we will provision for the inner vector in the mentioned example. So only 1 value in the inner vector of the list for 10 outer positions in the list vector. Does that make sense? Think about a mix of null and non-null lists. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386493#comment-16386493 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-370516083 I left some comments for the change. Sorry for the delay. (I forgot about this after I came back from vacation) This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386491#comment-16386491 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172282525 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are + * null (no lists) or empty lists. This helps in tightly controlling + * the memory we provision for inner data vector. + * + * Similar analogy is applicable for VarCharVector where the capacity + * of the data buffer can be controlled using density multiplier + * instead of default multiplier of 8 (default size of average + * varchar length). + * + * Also from container vectors, we propagate the density down + * the the inner vectors so that they can use it appropriately. + */ +public interface DensityAwareVector { Review comment: Should this be a sub interface of `ValueVector`? It feels a bit strange this interface is called `Vector` but doesn't define any vector methods. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386490#comment-16386490 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172282525 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are + * null (no lists) or empty lists. This helps in tightly controlling + * the memory we provision for inner data vector. + * + * Similar analogy is applicable for VarCharVector where the capacity + * of the data buffer can be controlled using density multiplier + * instead of default multiplier of 8 (default size of average + * varchar length). + * + * Also from container vectors, we propagate the density down + * the the inner vectors so that they can use it appropriately. + */ +public interface DensityAwareVector { Review comment: Should this be a sub interface of `ValueVector`? It feels a bit strange this interface is called `Vector` but only has one method. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386486#comment-16386486 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172281889 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,57 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + * We use this for ListVector and VarCharVector as of now to + * control the memory allocated. + * + * For ListVector, we have been using a multiplier of 5 + * to compute the initial capacity of the inner data vector. + * For deeply nested lists and lists with lots of NULL values, + * this is over-allocation upfront. So density helps to be + * conservative when computing the value capacity of the + * inner vector. + * + * For example, a density value of 10 implies each position in the + * list vector has a list of 10 values. So we will provision + * an initial capacity of (valuecount * 10) for the inner vector. + * A density value of 0.1 implies out of 10 positions in the list vector, + * 1 position has a list of size 1 and remaining positions are Review comment: Sorry I don't understand this completely. Why is the 1 position that is not null has size == 1? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386487#comment-16386487 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172281991 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java ## @@ -99,6 +99,17 @@ public void setInitialCapacity(int numRecords) { } } + @Override + public void setInitialCapacity(int valueCount, double density) { +for (final ValueVector vector : (Iterable) this) { + if (vector instanceof DensityAwareVector) { Review comment: I think that's unnecessary delegation and then there is no purpose of having a DensityAwareVector interface since typically everyone will implement the interface -- some will delegate and rest will implement. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386483#comment-16386483 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172281445 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java ## @@ -1933,15 +1933,6 @@ public void testSetInitialCapacity() { vector.allocateNew(); assertEquals(4096, vector.getValueCapacity()); assertEquals(64, vector.getDataBuffer().capacity()); - - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); Review comment: added back the test with assertion This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386484#comment-16386484 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172281516 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java ## @@ -810,15 +810,6 @@ public void testSetInitialCapacity() { vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); assertEquals(8, vector.getDataVector().getValueCapacity()); - - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); Review comment: you are right. added back the test with assertion. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386479#comment-16386479 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172281168 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java ## @@ -99,6 +99,17 @@ public void setInitialCapacity(int numRecords) { } } + @Override + public void setInitialCapacity(int valueCount, double density) { +for (final ValueVector vector : (Iterable) this) { + if (vector instanceof DensityAwareVector) { Review comment: What do you think of having all vectors implement `setInitialCapacity(valueCount, density)` and delegate to `setInitialCapacity(valueCount)` instead of case matching here? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386480#comment-16386480 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172281168 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/NonNullableStructVector.java ## @@ -99,6 +99,17 @@ public void setInitialCapacity(int numRecords) { } } + @Override + public void setInitialCapacity(int valueCount, double density) { +for (final ValueVector vector : (Iterable) this) { + if (vector instanceof DensityAwareVector) { Review comment: What do you think of having non density aware vectors implement `setInitialCapacity(valueCount, density)` and delegate to `setInitialCapacity(valueCount)` instead of case matching here? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386481#comment-16386481 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172281369 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: The only thing we want to ensure is that if realloc is starting with existing capacity as 0, the caller should not run into an infinite loop and that's why we do max and set it to 1 if needed. Adding an assertion sounds fine but that implicitly asks for removing the setting to math.max (blah, 1). I am suggesting to keep the max setting but I don't have a strong opinion This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386472#comment-16386472 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172280459 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: Also, I think there are too many such statements. Can we put it in `nextPowerOfTwo` or maybe create a new method `nextPowerofTwoZeroSafe`? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386474#comment-16386474 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172280459 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: Also, I think there are too many such statements. Can we put it in `nextPowerOfTwo` or maybe create a new method `nextPowerOfTwoZeroSafe`? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386464#comment-16386464 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172279715 ## File path: java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java ## @@ -174,14 +174,14 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = Math.max((long)(valueCount * density), 1L); Review comment: I am not 100% sure about this. Why isn't this a illegal argument exception like before? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386461#comment-16386461 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172279395 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: Sounds like we have the assumption that newAllocationSize should not be 0 because setInitializeCapacity prevents 0. I think it's better to use assert to validation the assumption. It feels more robust. WDYT? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386450#comment-16386450 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172277608 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: nextPowerOfTwo of 0 is returned as 0 by BaseAllocator.nextPowerOfTwo which is why we initially safeguarded the realloc function to be aware of 0 initial capacity. Now that setInitialCapacity prevents 0 initial capacity, doing a check in realloc may not be absolutely necessary but I suggest we should keep it -- no harm This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386416#comment-16386416 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172269530 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: I guess I mean is it possible to be negative? If not then `newAllocationSize` is a long so the only other possibility is 0 and wouldn't `nextPowerOfTwo` change the 0 to a 1? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16386411#comment-16386411 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172268388 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java ## @@ -810,15 +810,6 @@ public void testSetInitialCapacity() { vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); assertEquals(8, vector.getDataVector().getValueCapacity()); - - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); Review comment: Yes, that I what I mean. Still call `vector.setInitialCapacity(5, 0.1);` and just assert that capacity is 1 instead of trying to catch the exception This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385368#comment-16385368 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-370262328 Addressed review comments. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385363#comment-16385363 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172063380 ## File path: java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java ## @@ -174,14 +174,16 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = (long) (valueCount * density); if (size > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } + +if(size == 0) { + size = 1; +} Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385356#comment-16385356 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172062968 ## File path: java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java ## @@ -174,14 +174,16 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = (long) (valueCount * density); if (size > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } + +if(size == 0) { Review comment: Yes we cannot have an initial capacity of 0 because then our safe* functions run into an infinite loop where they try to realloc and have the target buffer size as next power of 2 -- BaseAllocator.nextPowerOfTwo returns 0 for 0 and thus safe functions keep calling realloc. This happens if the initial capacity was 0. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385355#comment-16385355 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172063231 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: Now that setInitialCapacity is safeguarded to not allow an initial capacity less than 1, we may not hit this case but I think it is better to do the check in realloc as well -- else we will run in infinite loop. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385352#comment-16385352 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172063146 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java ## @@ -810,15 +810,6 @@ public void testSetInitialCapacity() { vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); assertEquals(8, vector.getDataVector().getValueCapacity()); - - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); Review comment: No earlier we were throwing IllegalStateException but that shouldn't be done. Now we take a max and set it to 1. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385354#comment-16385354 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172063162 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java ## @@ -1933,15 +1933,6 @@ public void testSetInitialCapacity() { vector.allocateNew(); assertEquals(4096, vector.getValueCapacity()); assertEquals(64, vector.getDataBuffer().capacity()); - - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); Review comment: We take a max and set it to 1 if needed. Exception handling is not needed. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385349#comment-16385349 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172063122 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java ## @@ -166,13 +168,23 @@ public void setInitialCapacity(int numRecords) { *This helps in tightly controlling the memory we provision *for inner data vector. */ + @Override public void setInitialCapacity(int numRecords, double density) { +if ((numRecords * density) >= Integer.MAX_VALUE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); +} offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; -final int innerValueCapacity = (int)(numRecords * density); -if (innerValueCapacity < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential value capacity for the data vector is 0"); +int innerValueCapacity = (int)(numRecords * density); + +if(innerValueCapacity == 0) { + innerValueCapacity = 1; +} Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385347#comment-16385347 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172062999 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + */ +public interface DensityAwareVector { + /** + * Set value with density + * @param valueCount + * @param density + */ + void setInitialCapacity(int valueCount, double density); + Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385345#comment-16385345 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172062983 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + */ +public interface DensityAwareVector { Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385344#comment-16385344 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172062968 ## File path: java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java ## @@ -174,14 +174,16 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = (long) (valueCount * density); if (size > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } + +if(size == 0) { Review comment: Yes we cannot have an initial capacity of 0 because then our safe* functions runs into an infinite loop where they try to realloc and have the target buffer size as next power of 2 -- BaseAllocator.nextPowerOfTwo returns 0 for 0 and thus safe functions keep calling realloc. This happens if the initial capacity was 0. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16385343#comment-16385343 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r172062884 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. Review comment: Done. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377362#comment-16377362 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170692344 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestListVector.java ## @@ -810,15 +810,6 @@ public void testSetInitialCapacity() { vector.allocateNew(); assertEquals(512, vector.getValueCapacity()); assertEquals(8, vector.getDataVector().getValueCapacity()); - - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); Review comment: Is it still valid to keep this to check that the capactiy is 1 (iiuc) and only remove the exception handling? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377360#comment-16377360 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170690111 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + */ +public interface DensityAwareVector { + /** + * Set value with density + * @param valueCount + * @param density + */ + void setInitialCapacity(int valueCount, double density); + Review comment: and remove this line This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377358#comment-16377358 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170689981 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. + */ +public interface DensityAwareVector { Review comment: nit: I think there should be a blank line before the comment This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377355#comment-16377355 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170686874 ## File path: java/vector/src/main/java/org/apache/arrow/vector/DensityAwareVector.java ## @@ -0,0 +1,32 @@ +/** + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector; + +/** + * Vector that support density aware initial capacity settings. Review comment: Could you add some description for how the value of density affects allocation? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377359#comment-16377359 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170688920 ## File path: java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java ## @@ -174,14 +174,16 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = (long) (valueCount * density); if (size > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } + +if(size == 0) { Review comment: I'm not sure if this is all that useful, but this means we can't have an initial capacity of 0? Is that something that we should still consider? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377356#comment-16377356 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170692729 ## File path: java/vector/src/main/codegen/templates/UnionVector.java ## @@ -282,6 +282,7 @@ private void reallocTypeBuffer() { long newAllocationSize = baseSize * 2L; newAllocationSize = BaseAllocator.nextPowerOfTwo(newAllocationSize); +newAllocationSize = Math.max(newAllocationSize, 1); Review comment: I may be missing something, but could you explain under what circumstances this would be `< 1`? is it just possible to be 0? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377363#comment-16377363 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170687628 ## File path: java/vector/src/main/java/org/apache/arrow/vector/BaseVariableWidthVector.java ## @@ -174,14 +174,16 @@ public void setInitialCapacity(int valueCount) { * @param valueCount desired number of elements in the vector * @param density average number of bytes per variable width element */ + @Override public void setInitialCapacity(int valueCount, double density) { -final long size = (long) (valueCount * density); -if (size < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential capacity of the data buffer is 0"); -} +long size = (long) (valueCount * density); if (size > MAX_ALLOCATION_SIZE) { throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); } + +if(size == 0) { + size = 1; +} Review comment: can you combine this with above with `max` to make more compact like ``` long size = Math.max((long) (valueCount * density), 1) ``` This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377357#comment-16377357 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170690365 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java ## @@ -166,13 +168,23 @@ public void setInitialCapacity(int numRecords) { *This helps in tightly controlling the memory we provision *for inner data vector. */ + @Override public void setInitialCapacity(int numRecords, double density) { +if ((numRecords * density) >= Integer.MAX_VALUE) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); +} offsetAllocationSizeInBytes = (numRecords + 1) * OFFSET_WIDTH; -final int innerValueCapacity = (int)(numRecords * density); -if (innerValueCapacity < 1) { - throw new IllegalArgumentException("With the provided density and value count, potential value capacity for the data vector is 0"); +int innerValueCapacity = (int)(numRecords * density); + +if(innerValueCapacity == 0) { + innerValueCapacity = 1; +} Review comment: same as above using `Math.max` to make more compact This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16377361#comment-16377361 ] ASF GitHub Bot commented on ARROW-2199: --- BryanCutler commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170692410 ## File path: java/vector/src/test/java/org/apache/arrow/vector/TestValueVector.java ## @@ -1933,15 +1933,6 @@ public void testSetInitialCapacity() { vector.allocateNew(); assertEquals(4096, vector.getValueCapacity()); assertEquals(64, vector.getDataBuffer().capacity()); - - boolean error = false; - try { -vector.setInitialCapacity(5, 0.1); Review comment: same here This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16375862#comment-16375862 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-368271923 Pinging for approval. Need to close out on 0.9.0 items. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16375586#comment-16375586 ] ASF GitHub Bot commented on ARROW-2199: --- icexelloss commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-368231906 @siddharthteotia I am on vacation this week. I will try to review this but please do not block on me. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16373664#comment-16373664 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170124436 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java ## @@ -166,13 +168,23 @@ public void setInitialCapacity(int numRecords) { *This helps in tightly controlling the memory we provision *for inner data vector. */ + @Override public void setInitialCapacity(int numRecords, double density) { +if ((numRecords * density) >= 2_000_000_000) { Review comment: Done to safeguard against truncation. We can use Integer.MAX_VALUE instead. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16373662#comment-16373662 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170124342 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java ## @@ -166,13 +168,23 @@ public void setInitialCapacity(int numRecords) { *This helps in tightly controlling the memory we provision *for inner data vector. */ + @Override public void setInitialCapacity(int numRecords, double density) { +if ((numRecords * density) >= 2_000_000_000) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); Review comment: Sure, I will file a JIRA for this since this sort of problem needs to be addressed throughout the code. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16373597#comment-16373597 ] ASF GitHub Bot commented on ARROW-2199: --- vkorukanti commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170116190 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java ## @@ -166,13 +168,23 @@ public void setInitialCapacity(int numRecords) { *This helps in tightly controlling the memory we provision *for inner data vector. */ + @Override public void setInitialCapacity(int numRecords, double density) { +if ((numRecords * density) >= 2_000_000_000) { + throw new OversizedAllocationException("Requested amount of memory is more than max allowed"); Review comment: If possible can we add some context here like the current capacity control variables? Useful in debugging. This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16373598#comment-16373598 ] ASF GitHub Bot commented on ARROW-2199: --- vkorukanti commented on a change in pull request #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#discussion_r170115978 ## File path: java/vector/src/main/java/org/apache/arrow/vector/complex/BaseRepeatedValueVector.java ## @@ -166,13 +168,23 @@ public void setInitialCapacity(int numRecords) { *This helps in tightly controlling the memory we provision *for inner data vector. */ + @Override public void setInitialCapacity(int numRecords, double density) { +if ((numRecords * density) >= 2_000_000_000) { Review comment: why are we using a different constant here than MAX_ALLOCATION_SIZE? This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)
[jira] [Commented] (ARROW-2199) [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is never less than 1 and propagate density throughout the vector tree
[ https://issues.apache.org/jira/browse/ARROW-2199?page=com.atlassian.jira.plugin.system.issuetabpanels:comment-tabpanel&focusedCommentId=16373583#comment-16373583 ] ASF GitHub Bot commented on ARROW-2199: --- siddharthteotia commented on issue #1646: ARROW-2199: [JAVA] Control the memory allocated for inner vectors in containers. URL: https://github.com/apache/arrow/pull/1646#issuecomment-367814904 This has fixes and improvements we did in Dremio as follow-up changes to ARROW-2019 https://github.com/apache/arrow/pull/1497. cc @vkorukanti , @BryanCutler , @icexelloss , @jacques-n, @StevenMPhillips This is an automated message from the Apache Git Service. To respond to the message, please log on GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org > [JAVA] Follow up fixes for ARROW-2019. Ensure density driven capacity is > never less than 1 and propagate density throughout the vector tree > --- > > Key: ARROW-2199 > URL: https://issues.apache.org/jira/browse/ARROW-2199 > Project: Apache Arrow > Issue Type: Improvement > Components: Java - Vectors >Reporter: Siddharth Teotia >Assignee: Siddharth Teotia >Priority: Major > Labels: pull-request-available > Fix For: 0.9.0 > > -- This message was sent by Atlassian JIRA (v7.6.3#76005)