[GitHub] [arrow] jhorstmann commented on pull request #7854: ARROW-9583: [Rust] Fix offsets in result of arithmetic kernels
jhorstmann commented on pull request #7854: URL: https://github.com/apache/arrow/pull/7854#issuecomment-670407634 @paddyhoran can you take a look at this PR? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fredgan commented on pull request #7823: ARROW-9548: [Go] Test output files are not removed correctly
fredgan commented on pull request #7823: URL: https://github.com/apache/arrow/pull/7823#issuecomment-670420835 @sbinet Yeah, I agree. I have changed them to this. Should be OK now. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fredgan edited a comment on pull request #7823: ARROW-9548: [Go] Test output files are not removed correctly
fredgan edited a comment on pull request #7823: URL: https://github.com/apache/arrow/pull/7823#issuecomment-670420835 @sbinet Yeah, I agree. I have changed them to this. Sorry for the long delay in replying. Should be OK now. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fredgan commented on pull request #7824: ARROW-9205: [Documentation] Fix typos
fredgan commented on pull request #7824: URL: https://github.com/apache/arrow/pull/7824#issuecomment-670422135 @wesm Hi Wes, do you have any opinion about this? If not, can you help handle it? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fredgan edited a comment on pull request #7824: ARROW-9205: [Documentation] Fix typos
fredgan edited a comment on pull request #7824: URL: https://github.com/apache/arrow/pull/7824#issuecomment-670422135 @wesm Hi Wes, do you have any opinion about this? If not, can you help handle it? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sbinet commented on a change in pull request #7823: ARROW-9548: [Go] Test output files are not removed correctly
sbinet commented on a change in pull request #7823: URL: https://github.com/apache/arrow/pull/7823#discussion_r466958913 ## File path: go/arrow/arrio/arrio_test.go ## @@ -94,19 +100,17 @@ func TestCopy(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) - f, err := ioutil.TempFile("", "arrow-ipc-") + f, err := ioutil.TempFile(tempDir, "arrow-ipc-") if err != nil { t.Fatal(err) } defer f.Close() - defer os.Remove(f.Name()) - o, err := ioutil.TempFile("", "arrow-ipc-") + o, err := ioutil.TempFile(tempDir, "arrow-ipc-") Review comment: ditto ## File path: go/arrow/arrio/arrio_test.go ## @@ -67,6 +67,12 @@ func (k copyKind) check(t *testing.T, f *os.File, mem memory.Allocator, schema * func TestCopy(t *testing.T) { type kind int + tempDir, err := ioutil.TempDir("", "arrow-ipc-") Review comment: ```suggestion tempDir, err := ioutil.TempDir("", "arrow-io-") ``` ## File path: go/arrow/internal/arrjson/arrjson_test.go ## @@ -36,12 +42,11 @@ func TestReadWrite(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) - f, err := ioutil.TempFile("", "arrjson-") + f, err := ioutil.TempFile(tempDir, "arrjson-") Review comment: or use the test name to derive a file name? ## File path: go/arrow/ipc/cmd/arrow-file-to-stream/main_test.go ## @@ -27,25 +27,30 @@ import ( ) func TestFileToStream(t *testing.T) { + tempDir, err := ioutil.TempDir("", "arrow-ipc-") + if err != nil { + t.Fatal(err) + } + defer os.RemoveAll(tempDir) + for name, recs := range arrdata.Records { t.Run(name, func(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) - f, err := ioutil.TempFile("", "arrow-ipc-") + f, err := ioutil.TempFile(tempDir, "arrow-ipc-") Review comment: ditto, and for all occurrences below. ## File path: go/arrow/ipc/cmd/arrow-cat/main_test.go ## @@ -515,7 +526,7 @@ record 3/3... 
defer mem.AssertSize(t, 0) fname := func() string { - f, err := ioutil.TempFile("", "go-arrow-") + f, err := ioutil.TempFile(tempDir, "go-arrow-") Review comment: ditto. ## File path: go/arrow/arrio/arrio_test.go ## @@ -94,19 +100,17 @@ func TestCopy(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) - f, err := ioutil.TempFile("", "arrow-ipc-") + f, err := ioutil.TempFile(tempDir, "arrow-ipc-") Review comment: ```suggestion f, err := ioutil.TempFile(tempDir, "arrow-io-") ``` or just use the test name? ## File path: go/arrow/ipc/cmd/arrow-cat/main_test.go ## @@ -174,7 +180,7 @@ record 3... defer mem.AssertSize(t, 0) fname := func() string { - f, err := ioutil.TempFile("", "go-arrow-") + f, err := ioutil.TempFile(tempDir, "go-arrow-") Review comment: ditto This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rymurr commented on a change in pull request #7815: ARROW-9536: [Java] Miss parameters in PlasmaOutOfMemoryException.java
rymurr commented on a change in pull request #7815: URL: https://github.com/apache/arrow/pull/7815#discussion_r466964416 ## File path: java/plasma/src/test/java/org/apache/arrow/plasma/PlasmaClientTest.java ## @@ -277,6 +278,20 @@ public void doByteBufferTest() { client.release(id); } + public void doPlasmaOutOfMemoryExceptionTest() { +System.out.println("Start PlasmaOutOfMemoryException test."); +PlasmaClient client = (PlasmaClient) pLink; +byte[] objectId = new byte[20]; +Arrays.fill(objectId, (byte) 1); +try { + ByteBuffer byteBuffer = client.create(objectId, 2, null); + client.seal(objectId); +} catch (PlasmaOutOfMemoryException e) { + System.out.println(String.format("Expected PlasmaOutOfMemoryException: %s", e)); Review comment: out of curiosity why isn't this entire class run as a unit or integration test? This currently isn't being run as part of any CI right? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche commented on a change in pull request #7891: ARROW-9629: [Python] Fix kartothek integration tests by fixing dependencies
jorisvandenbossche commented on a change in pull request #7891: URL: https://github.com/apache/arrow/pull/7891#discussion_r466989211 ## File path: ci/docker/conda-python-kartothek.dockerfile ## @@ -32,7 +32,14 @@ RUN conda install -c conda-forge -q \ storefact \ toolz \ urlquote \ -zstandard && \ +zstandard \ +attrs \ Review comment: Updated This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche commented on pull request #7891: ARROW-9629: [Python] Fix kartothek integration tests by fixing dependencies
jorisvandenbossche commented on pull request #7891: URL: https://github.com/apache/arrow/pull/7891#issuecomment-670478968 cc @fjetter @xhochy in case if are familiar with those failures There are several failures like this (related to exit codes): ``` _ test_simple __ cli = ._cli at 0x7ff27b5fed40> built_cube = Cube(dimension_columns=('x', 'y'), partition_columns=('p', 'q'), uuid_prefix='my_cube', seed_dataset='source', index_columns=frozenset({'i2', 'i1'})) mock_prompt = ._f at 0x7ff27b5b6830> df_complete = i1 i2 p ... v2 x y 0 True 2018-01-01 0 ... ... 8, 9] 1 0 3 False 2019-01-01 0 ... [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,... 1 1 [4 rows x 9 columns] skv = None def test_simple(cli, built_cube, mock_prompt, df_complete, skv): mock_prompt(["", ""]) # conditions # payload result = cli("--store=cubes", "my_cube", "query", input="df\n") str_df = str(df_complete.loc[:, ["p", "q", "x", "y"]].reset_index(drop=True)) > assert result.exit_code == 1 E assert 2 == 1 E+ where 2 = .exit_code /kartothek/tests/cli/test_query.py:64: AssertionError ``` and there are a few failures like ``` test_predicate_pushdown_null_col[True-serialiser4-df3-value3] _ store = , df = t 0 NaT value = Timestamp('2017-01-01 00:00:00'), predicate_pushdown_to_io = True serialiser = CsvSerializer(compress=True) @pytest.mark.parametrize( "df,value", [ (pd.DataFrame({"u": pd.Series([None], dtype=object)}), "foo"), (pd.DataFrame({"b": pd.Series([None], dtype=object)}), b"foo"), (pd.DataFrame({"f": pd.Series([np.nan], dtype=float)}), 1.2), ( pd.DataFrame({"t": pd.Series([pd.NaT], dtype="datetime64[ns]")}), pd.Timestamp("2017"), ), ], ) @predicate_serialisers @pytest.mark.parametrize("predicate_pushdown_to_io", [True, False]) def test_predicate_pushdown_null_col( store, df, value, predicate_pushdown_to_io, serialiser ): key = serialiser.store(store, "prefix", df) expected = df.iloc[[]].copy() predicates = [[(df.columns[0], "==", value)]] result = serialiser.restore_dataframe( store, key, 
predicate_pushdown_to_io=predicate_pushdown_to_io, predicates=predicates, ) pdt.assert_frame_equal( result.reset_index(drop=True), expected.reset_index(drop=True), > check_dtype=serialiser.type_stable, ) tests/serialization/test_dataframe.py:529: _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ left = array([], dtype=float64) right = [] Length: 0, dtype: datetime64[ns], check_dtype = True index_values = array([], dtype=int64) check_less_precise = , check_exact = False rtol = 1e-05, atol = 1e-08 > assert isinstance(left, ExtensionArray), "left is not an ExtensionArray" E AssertionError: left is not an ExtensionArray /opt/conda/envs/arrow/lib/python3.7/site-packages/pandas/_testing.py:1175: AssertionError ``` Since this is related to a pandas testing utility, does kartothek already support pandas 1.1? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche commented on pull request #7891: ARROW-9629: [Python] Fix kartothek integration tests by fixing dependencies
jorisvandenbossche commented on pull request #7891: URL: https://github.com/apache/arrow/pull/7891#issuecomment-670479036 @github-actions crossbow submit test-conda-python-3.7-kartothek-latest test-conda-python-3.7-kartothek-master This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7891: ARROW-9629: [Python] Fix kartothek integration tests by fixing dependencies
github-actions[bot] commented on pull request #7891: URL: https://github.com/apache/arrow/pull/7891#issuecomment-670479711 Revision: e21e9ff23f98427d1898f8de99945549e2fabe12 Submitted crossbow builds: [ursa-labs/crossbow @ actions-468](https://github.com/ursa-labs/crossbow/branches/all?query=actions-468) |Task|Status| ||--| |test-conda-python-3.7-kartothek-latest|[![Github Actions](https://github.com/ursa-labs/crossbow/workflows/Crossbow/badge.svg?branch=actions-468-github-test-conda-python-3.7-kartothek-latest)](https://github.com/ursa-labs/crossbow/actions?query=branch:actions-468-github-test-conda-python-3.7-kartothek-latest)| |test-conda-python-3.7-kartothek-master|[![Github Actions](https://github.com/ursa-labs/crossbow/workflows/Crossbow/badge.svg?branch=actions-468-github-test-conda-python-3.7-kartothek-master)](https://github.com/ursa-labs/crossbow/actions?query=branch:actions-468-github-test-conda-python-3.7-kartothek-master)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] liyafan82 commented on a change in pull request #7248: ARROW-8402: [Java] Support ValidateFull methods in Java
liyafan82 commented on a change in pull request #7248: URL: https://github.com/apache/arrow/pull/7248#discussion_r467003950 ## File path: java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.validate.ValidateUtility.validateOrThrow; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.pojo.ArrowType; + +/** + * Visitor to validate vector buffers. 
+ */ +public class ValidateVectorBufferVisitor implements VectorVisitor { + + private void validateVectorCommon(ValueVector vector) { +ArrowType arrowType = vector.getField().getType(); +validateOrThrow(vector.getValueCount() >= 0, "vector valueCount is negative"); + +if (vector instanceof FieldVector) { + FieldVector fieldVector = (FieldVector) vector; + int typeBufferCount = TypeLayout.getTypeBufferCount(arrowType); + validateOrThrow(fieldVector.getFieldBuffers().size() == typeBufferCount, + String.format("Expected %s buffers in vector of type %s, got %s", + typeBufferCount, vector.getField().getType().toString(), fieldVector.getFieldBuffers().size())); +} + } + + private void validateValidityBuffer(ValueVector vector, int valueCount) { +ArrowBuf validityBuffer = vector.getValidityBuffer(); +validateOrThrow(validityBuffer != null, "The validity buffer is null."); +validateOrThrow(validityBuffer.capacity() * 8 >= valueCount, "No enough capacity for the validity buffer."); + } + + private void validateOffsetBuffer(ValueVector vector, int valueCount) { +ArrowBuf offsetBuffer = vector.getOffsetBuffer(); +validateOrThrow(offsetBuffer != null, "The offset buffer is null."); +if (valueCount > 0) { + validateOrThrow(offsetBuffer.capacity() >= (valueCount + 1) * 4, "No enough capacity for the offset buffer."); +} + } + + private void validateLargeOffsetBuffer(ValueVector vector, int valueCount) { +ArrowBuf offsetBuffer = vector.getOffsetBuffer(); +validateOrThrow(offsetBuffer != null, "The large offset buffer is null."); +if (valueCount > 0) { + validateOrThrow(offsetBuffer.capacity() >= (valueCount + 1) * 8, + "No enough capacity for the large offset buffer."); +} + } + + private void validateFixedWidthDataBuffer(ValueVector vector, int valueCount, int bitWidth) { +ArrowBuf dataBuffer = vector.getDataBuffer(); +validateOrThrow(dataBuffer != null, "The fixed width data buffer is null."); +validateOrThrow((long) bitWidth * valueCount <= dataBuffer.capacity() * 8L, +"No 
enough capacity for fixed width data buffer"); + } + + private void validateDataBuffer(ValueVector vector, long minCapacity) { +ArrowBuf dataBuffer = vector.getDataBuffer(); +validateOrThrow(dataBuffer != null, "The data buffer is null."); +validateOrThrow(dataBuffer.capacity() >= minCapacity, "No enough capacity for data buffer"); + } + + @Override + public Void visit(BaseFixedWidthVector vector, Void value) { +int bitWidth = (vector instanceof BitVector) ? 1 : vector.getTypeWidth() * 8; +int valueCount = vector.getValueCount(); +validateVectorCommon(vector); +validateValidityBuffer(vector, valueCount); +validateFixedWidthDataBuffer(vector, value
[GitHub] [arrow] liyafan82 commented on a change in pull request #7248: ARROW-8402: [Java] Support ValidateFull methods in Java
liyafan82 commented on a change in pull request #7248: URL: https://github.com/apache/arrow/pull/7248#discussion_r467004312 ## File path: java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java ## @@ -0,0 +1,221 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.validate.ValidateUtility.validateOrThrow; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.pojo.ArrowType; + +/** + * Visitor to validate vector buffers. 
+ */ +public class ValidateVectorBufferVisitor implements VectorVisitor { + + private void validateVectorCommon(ValueVector vector) { +ArrowType arrowType = vector.getField().getType(); +validateOrThrow(vector.getValueCount() >= 0, "vector valueCount is negative"); + +if (vector instanceof FieldVector) { + FieldVector fieldVector = (FieldVector) vector; + int typeBufferCount = TypeLayout.getTypeBufferCount(arrowType); + validateOrThrow(fieldVector.getFieldBuffers().size() == typeBufferCount, + String.format("Expected %s buffers in vector of type %s, got %s", + typeBufferCount, vector.getField().getType().toString(), fieldVector.getFieldBuffers().size())); +} + } + + private void validateValidityBuffer(ValueVector vector, int valueCount) { +ArrowBuf validityBuffer = vector.getValidityBuffer(); +validateOrThrow(validityBuffer != null, "The validity buffer is null."); +validateOrThrow(validityBuffer.capacity() * 8 >= valueCount, "No enough capacity for the validity buffer."); + } + + private void validateOffsetBuffer(ValueVector vector, int valueCount) { +ArrowBuf offsetBuffer = vector.getOffsetBuffer(); +validateOrThrow(offsetBuffer != null, "The offset buffer is null."); +if (valueCount > 0) { + validateOrThrow(offsetBuffer.capacity() >= (valueCount + 1) * 4, "No enough capacity for the offset buffer."); +} + } + + private void validateLargeOffsetBuffer(ValueVector vector, int valueCount) { +ArrowBuf offsetBuffer = vector.getOffsetBuffer(); +validateOrThrow(offsetBuffer != null, "The large offset buffer is null."); +if (valueCount > 0) { + validateOrThrow(offsetBuffer.capacity() >= (valueCount + 1) * 8, + "No enough capacity for the large offset buffer."); +} + } + + private void validateFixedWidthDataBuffer(ValueVector vector, int valueCount, int bitWidth) { +ArrowBuf dataBuffer = vector.getDataBuffer(); +validateOrThrow(dataBuffer != null, "The fixed width data buffer is null."); +validateOrThrow((long) bitWidth * valueCount <= dataBuffer.capacity() * 8L, +"No 
enough capacity for fixed width data buffer"); + } + + private void validateDataBuffer(ValueVector vector, long minCapacity) { +ArrowBuf dataBuffer = vector.getDataBuffer(); +validateOrThrow(dataBuffer != null, "The data buffer is null."); +validateOrThrow(dataBuffer.capacity() >= minCapacity, "No enough capacity for data buffer"); + } + + @Override + public Void visit(BaseFixedWidthVector vector, Void value) { +int bitWidth = (vector instanceof BitVector) ? 1 : vector.getTypeWidth() * 8; +int valueCount = vector.getValueCount(); +validateVectorCommon(vector); +validateValidityBuffer(vector, valueCount); +validateFixedWidthDataBuffer(vector, value
[GitHub] [arrow] liyafan82 commented on a change in pull request #7248: ARROW-8402: [Java] Support ValidateFull methods in Java
liyafan82 commented on a change in pull request #7248: URL: https://github.com/apache/arrow/pull/7248#discussion_r467004875 ## File path: java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorDataVisitor.java ## @@ -0,0 +1,148 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.validate.ValidateUtility.validateOrThrow; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; + +/** + * Utility for validating vector data. 
+ */ +public class ValidateVectorDataVisitor implements VectorVisitor { + + private void validateOffsetBuffer(ValueVector vector, int valueCount) { +if (valueCount == 0) { + return; +} +ArrowBuf offsetBuffer = vector.getOffsetBuffer(); + +// verify that the values in the offset buffer is non-decreasing +int prevValue = offsetBuffer.getInt(0); +for (int i = 1; i <= valueCount; i++) { Review comment: Sure. Thanks for your kind reminder. We provide separate check logic for dense union vectors in the revised PR. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] liyafan82 commented on a change in pull request #7248: ARROW-8402: [Java] Support ValidateFull methods in Java
liyafan82 commented on a change in pull request #7248: URL: https://github.com/apache/arrow/pull/7248#discussion_r467005269 ## File path: java/vector/src/test/java/org/apache/arrow/vector/validate/TestValidateVectorFull.java ## @@ -0,0 +1,158 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.apache.arrow.vector.util.ValueVectorUtility.validateFull; +import static org.junit.Assert.assertTrue; +import static org.junit.jupiter.api.Assertions.assertThrows; + +import java.util.Arrays; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.IntVector; +import org.apache.arrow.vector.LargeVarCharVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.StructVector; +import org.apache.arrow.vector.testing.ValueVectorDataPopulator; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + +public class TestValidateVectorFull { + + private BufferAllocator allocator; + + @Before + public void init() { +allocator = new RootAllocator(Long.MAX_VALUE); + } + + @After + public void terminate() throws Exception { +allocator.close(); + } + + @Test + public void testBaseVariableWidthVector() { +try (final VarCharVector vector = new VarCharVector("v", allocator)) { + validateFull(vector); + setVector(vector, "aaa", "bbb", "ccc"); + validateFull(vector); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setInt(0, 100); + offsetBuf.setInt(4, 50); + + ValidateUtility.ValidateException e = assertThrows(ValidateUtility.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in the offset buffer are decreasing")); +} + } + + @Test + public void testBaseLargeVariableWidthVector() { +try (final LargeVarCharVector vector = new LargeVarCharVector("v", allocator)) { + 
validateFull(vector); + setVector(vector, "aaa", "bbb", null, "ccc"); + validateFull(vector); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setInt(0, 100); + offsetBuf.setInt(4, 50); + + ValidateUtility.ValidateException e = assertThrows(ValidateUtility.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in the large offset buffer are decreasing")); +} + } + + @Test + public void testListVector() { +try (final ListVector vector = ListVector.empty("v", allocator)) { + validateFull(vector); + setVector(vector, Arrays.asList(1, 2, 3), Arrays.asList(4, 5), Arrays.asList(6, 7, 8, 9)); + validateFull(vector); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setInt(0, 100); + offsetBuf.setInt(8, 50); + + ValidateUtility.ValidateException e = assertThrows(ValidateUtility.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in the offset buffer are decreasing")); +} + } + + @Test + public void testLargeListVector() { +try (final LargeListVector vector = LargeListVector.empty("v", allocator)) { + validateFull(vector); + setVector(vector, Arrays.asList(1, 2, 3), Arrays.asList(4, 5), Arrays.asList(6, 7, 8, 9)); + validateFull(vector); + + ArrowBuf offsetBuf = vector.getOffsetBuffer(); + offsetBuf.setLong(0, 100); + offsetBuf.setLong(16, 50); + + ValidateUtility.ValidateException e = assertThrows(ValidateUtility.ValidateException.class, + () -> validateFull(vector)); + assertTrue(e.getMessage().contains("The values in the large offset buffer are decreasing")); +} + } + + @Test + public void testStructVectorRangeEquals() {
[GitHub] [arrow] fredgan commented on a change in pull request #7823: ARROW-9548: [Go] Test output files are not removed correctly
fredgan commented on a change in pull request #7823: URL: https://github.com/apache/arrow/pull/7823#discussion_r467012651 ## File path: go/arrow/internal/arrjson/arrjson_test.go ## @@ -36,12 +42,11 @@ func TestReadWrite(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) - f, err := ioutil.TempFile("", "arrjson-") + f, err := ioutil.TempFile(tempDir, "arrjson-") Review comment: @sbinet Well, I prefer the test function name. And maybe add "go-arrow-" prefix will be better. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche commented on pull request #7907: ARROW-9644: [C++][Dataset] Don't apply ignore_prefixes to partition base_dir
jorisvandenbossche commented on pull request #7907: URL: https://github.com/apache/arrow/pull/7907#issuecomment-670496983 Added the specific case of a directory to the existing test about this This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sbinet commented on a change in pull request #7823: ARROW-9548: [Go] Test output files are not removed correctly
sbinet commented on a change in pull request #7823: URL: https://github.com/apache/arrow/pull/7823#discussion_r467016096 ## File path: go/arrow/internal/arrjson/arrjson_test.go ## @@ -36,12 +42,11 @@ func TestReadWrite(t *testing.T) { mem := memory.NewCheckedAllocator(memory.NewGoAllocator()) defer mem.AssertSize(t, 0) - f, err := ioutil.TempFile("", "arrjson-") + f, err := ioutil.TempFile(tempDir, "arrjson-") Review comment: fair enough. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sbinet closed pull request #7823: ARROW-9548: [Go] Test output files are not removed correctly
sbinet closed pull request #7823: URL: https://github.com/apache/arrow/pull/7823 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche commented on pull request #7793: ARROW-9107: [C++][Dataset] Support temporal partitioning fields
jorisvandenbossche commented on pull request #7793: URL: https://github.com/apache/arrow/pull/7793#issuecomment-670506980 One note on this: officially, ISO 8601 says one has to use "T" as separator for the date and time component of a string, and not a white space (although in practice of course the version with a space is used a lot). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] vertexclique commented on pull request #7894: ARROW-9631: [Rust] Make arrow not depend on flight
vertexclique commented on pull request #7894: URL: https://github.com/apache/arrow/pull/7894#issuecomment-670545087 @andygrove Seems like done. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on a change in pull request #7911: ARROW-9606: [C++][Dataset] Support `"a"_.In(<>).Assume()`
nealrichardson commented on a change in pull request #7911: URL: https://github.com/apache/arrow/pull/7911#discussion_r467099186 ## File path: r/tests/testthat/test-dataset.R ## @@ -391,6 +391,17 @@ test_that("filter() with %in%", { collect(), tibble(int = df1$int[c(3, 4, 6)], part = 1) ) + + ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8())) Review comment: It's hive-style so you don't need to tell it what the partitions are. ```suggestion ds <- open_dataset(hive_dir) ``` ## File path: r/tests/testthat/test-dataset.R ## @@ -391,6 +391,17 @@ test_that("filter() with %in%", { collect(), tibble(int = df1$int[c(3, 4, 6)], part = 1) ) + Review comment: ```suggestion # ARROW-9606: bug in %in% filter on partition column with >1 partition columns ``` ## File path: r/tests/testthat/test-dataset.R ## @@ -391,6 +391,17 @@ test_that("filter() with %in%", { collect(), tibble(int = df1$int[c(3, 4, 6)], part = 1) ) + + ds <- open_dataset(hive_dir, partitioning = hive_partition(other = utf8(), group = uint8())) + expect_equivalent( +ds %>% + filter(group %in% c(2)) %>% + select(chr, dbl) %>% + filter(dbl > 7 & dbl < 53) %>% + collect() %>% + arrange(dbl), +df2[1:2, c("chr", "dbl")] Review comment: A simpler test better shows what this is testing. I confirmed that this fails on master per the user report. ```suggestion filter(group %in% 2) %>% collect(), df2 ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on pull request #7903: ARROW-9643: [C++] Only register the SIMD variants when it's supported.
nealrichardson commented on pull request #7903: URL: https://github.com/apache/arrow/pull/7903#issuecomment-670564397 @github-actions crossbow submit -g r This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7903: ARROW-9643: [C++] Only register the SIMD variants when it's supported.
github-actions[bot] commented on pull request #7903: URL: https://github.com/apache/arrow/pull/7903#issuecomment-670565444 Revision: 4300febdbf53bfebc98e5cea88b4b065ffc22041 Submitted crossbow builds: [ursa-labs/crossbow @ actions-469](https://github.com/ursa-labs/crossbow/branches/all?query=actions-469) |Task|Status| ||--| |homebrew-r-autobrew|[![TravisCI](https://img.shields.io/travis/ursa-labs/crossbow/actions-469-travis-homebrew-r-autobrew.svg)](https://travis-ci.org/ursa-labs/crossbow/branches)| |test-conda-r-4.0|[![Github Actions](https://github.com/ursa-labs/crossbow/workflows/Crossbow/badge.svg?branch=actions-469-github-test-conda-r-4.0)](https://github.com/ursa-labs/crossbow/actions?query=branch:actions-469-github-test-conda-r-4.0)| |test-r-linux-as-cran|[![Github Actions](https://github.com/ursa-labs/crossbow/workflows/Crossbow/badge.svg?branch=actions-469-github-test-r-linux-as-cran)](https://github.com/ursa-labs/crossbow/actions?query=branch:actions-469-github-test-r-linux-as-cran)| |test-r-rhub-ubuntu-gcc-release|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-469-azure-test-r-rhub-ubuntu-gcc-release)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1&branchName=actions-469-azure-test-r-rhub-ubuntu-gcc-release)| |test-r-rocker-r-base-latest|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-469-azure-test-r-rocker-r-base-latest)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1&branchName=actions-469-azure-test-r-rocker-r-base-latest)| |test-r-rstudio-r-base-3.6-bionic|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-469-azure-test-r-rstudio-r-base-3.6-bionic)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1&branchName=actions-469-azure-test-r-rstudio-r-base-3.6-bionic)| 
|test-r-rstudio-r-base-3.6-centos6|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-469-azure-test-r-rstudio-r-base-3.6-centos6)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1&branchName=actions-469-azure-test-r-rstudio-r-base-3.6-centos6)| |test-r-rstudio-r-base-3.6-centos8|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-469-azure-test-r-rstudio-r-base-3.6-centos8)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1&branchName=actions-469-azure-test-r-rstudio-r-base-3.6-centos8)| |test-r-rstudio-r-base-3.6-opensuse15|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-469-azure-test-r-rstudio-r-base-3.6-opensuse15)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1&branchName=actions-469-azure-test-r-rstudio-r-base-3.6-opensuse15)| |test-r-rstudio-r-base-3.6-opensuse42|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-469-azure-test-r-rstudio-r-base-3.6-opensuse42)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1&branchName=actions-469-azure-test-r-rstudio-r-base-3.6-opensuse42)| |test-ubuntu-18.04-r-sanitizer|[![Azure](https://dev.azure.com/ursa-labs/crossbow/_apis/build/status/ursa-labs.crossbow?branchName=actions-469-azure-test-ubuntu-18.04-r-sanitizer)](https://dev.azure.com/ursa-labs/crossbow/_build/latest?definitionId=1&branchName=actions-469-azure-test-ubuntu-18.04-r-sanitizer)| This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche closed pull request #7821: ARROW-9546: [Python] Clean up Pandas Metadata Conversion test
jorisvandenbossche closed pull request #7821: URL: https://github.com/apache/arrow/pull/7821 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jorisvandenbossche commented on pull request #7821: ARROW-9546: [Python] Clean up Pandas Metadata Conversion test
jorisvandenbossche commented on pull request #7821: URL: https://github.com/apache/arrow/pull/7821#issuecomment-670567980 Thanks! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on pull request #7894: ARROW-9631: [Rust] Make arrow not depend on flight
andygrove commented on pull request #7894: URL: https://github.com/apache/arrow/pull/7894#issuecomment-670575931 Thanks @vertexclique ... your regex skills are better than mine. One last thing ... we probably shouldn't have flight protocol changes as part of this PR. Could you revert those? I'm confused why these are still appearing and this is something I am going to look into next week. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fredgan commented on pull request #7752: ARROW-9462:[Go] The Indentation after the first Record in arrjson writer is incorrect
fredgan commented on pull request #7752: URL: https://github.com/apache/arrow/pull/7752#issuecomment-670601511 @sbinet OK, good point. I renamed them, and moved them to `arrjson_test.go`. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sbinet closed pull request #7752: ARROW-9462:[Go] The Indentation after the first Record in arrjson writer is incorrect
sbinet closed pull request #7752: URL: https://github.com/apache/arrow/pull/7752 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fredgan commented on pull request #7824: ARROW-9205: [Documentation] Fix typos
fredgan commented on pull request #7824: URL: https://github.com/apache/arrow/pull/7824#issuecomment-670605044 @sbinet Hi, it's not about the Go part, but I just fixed some typos when I read the documentation. Can you also help handle it? It's been a long time. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm closed pull request #7903: ARROW-9643: [C++] Only register the SIMD variants when it's supported.
wesm closed pull request #7903: URL: https://github.com/apache/arrow/pull/7903 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] bkietz commented on a change in pull request #7819: ARROW-9405: [R] Switch to cpp11
bkietz commented on a change in pull request #7819: URL: https://github.com/apache/arrow/pull/7819#discussion_r467084647 ## File path: r/src/array_from_vector.cpp ## @@ -406,9 +403,12 @@ std::shared_ptr MakeFactorArray(Rcpp::IntegerVector_ factor, case Type::INT64: return MakeFactorArrayImpl(factor, type); default: - Rcpp::stop(tfm::format("Cannot convert to dictionary with index_type %s", - dict_type.index_type()->ToString())); + break; } + + cpp11::stop("Cannot convert to dictionary with index_type '%s'", + dict_type.index_type()->ToString().c_str()); + return nullptr; Review comment: There *should* be no need to return here since `cpp11::stop()` is marked `[[noreturn]]` ## File path: r/src/array_from_vector.cpp ## @@ -1064,42 +1063,42 @@ class FixedSizeBinaryVectorConverter : public VectorConverter { FixedSizeBinaryBuilder* typed_builder_; }; -template +template class StringVectorConverter : public VectorConverter { public: ~StringVectorConverter() {} Status Init(ArrayBuilder* builder) { -typed_builder_ = checked_cast(builder); +typed_builder_ = checked_cast(builder); return Status::OK(); } Status Ingest(SEXP obj) { ARROW_RETURN_IF(TYPEOF(obj) != STRSXP, Status::RError("Expecting a character vector")); -R_xlen_t n = XLENGTH(obj); -// Reserve enough space before appending -int64_t size = 0; -for (R_xlen_t i = 0; i < n; i++) { - SEXP string_i = STRING_ELT(obj, i); - if (string_i != NA_STRING) { -size += XLENGTH(Rf_mkCharCE(Rf_translateCharUTF8(string_i), CE_UTF8)); - } +cpp11::strings s(obj); +RETURN_NOT_OK(typed_builder_->Reserve(s.size())); + +// note: the total length is calculated without utf8 +// conversion, so see this more as a hint rather than +// the actual total length +auto total_length_hint = 0; Review comment: ```suggestion int64_t total_length_hint = 0; ``` ## File path: r/src/arrow_cpp11.h ## @@ -0,0 +1,243 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#include +#include +#include +#include +#undef Free + +namespace cpp11 { + +template +SEXP as_sexp(const std::shared_ptr& ptr); + +template +SEXP as_sexp(const std::vector>& vec); + +template ::value>::type* = nullptr> +SEXP as_sexp(E e); + +} // namespace cpp11 + +#include Review comment: please keep all includes at the top of the file ## File path: r/src/arrow_cpp11.h ## @@ -0,0 +1,243 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#undef Free + +namespace cpp11 { + +template +SEXP as_sexp(const std::shared_ptr& ptr); + +template +SEXP as_sexp(const std::vector>& vec); + +template ::value>::type* = nullptr> +SEXP as_sexp(E e); + +} // namespace cpp11 + +#include + +namespace arrow { +namespace r { +struct symbols { + static SEXP units; + static SEXP tzone; + static SEXP xp; + static SEXP dot_Internal; + static SEXP inspect; + static SEXP row_names; + static SEXP serialize_arrow_r_metadata; + static SEXP as_list; + static SEXP ptype; + static SEXP byte_width; + static SEXP list_size; +}; + +struct data { + static SEXP classes_POSIXct; + static SEXP classes_metadata_r; + static SEXP classes_vctrs_list_of; + static SEXP classes_tb
[GitHub] [arrow] kou commented on a change in pull request #7824: ARROW-9205: [Documentation] Fix typos
kou commented on a change in pull request #7824: URL: https://github.com/apache/arrow/pull/7824#discussion_r467259196 ## File path: docs/source/format/Columnar.rst ## @@ -76,7 +76,7 @@ concepts, here is a small glossary to help disambiguate. * **Nested type**: a data type whose full structure depends on one or more other child types. Two fully-specified nested types are equal if and only if their child types are equal. For example, ``List`` - is distinct from ``List`` iff U and V are different types. + is distinct from ``List`` if U and V are different types. Review comment: I think that we should choose correctness over simplicity here because we explain the specification here. If we can keep this "iff", I can accept and merge this. If we can't, we need another committer's review. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] MingyuZhong opened a new pull request #7915: Fix a bug BasicDecimal128 constructor that interprets uint64_t integers with highest bit set as negative.
MingyuZhong opened a new pull request #7915: URL: https://github.com/apache/arrow/pull/7915 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on pull request #7915: ARROW-9671: [C++] Fix a bug in BasicDecimal128 constructor that interprets uint64_t integers with highest bit set as negative.
emkornfield commented on pull request #7915: URL: https://github.com/apache/arrow/pull/7915#issuecomment-670721077 Looks like formatting needs to be run: ``` template ::value && - (sizeof(T) <= sizeof(uint64_t)), T>::type> +typename = typename std::enable_if< +std::is_integral::value && (sizeof(T) <= sizeof(uint64_t)), T>::type> constexpr BasicDecimal128(T value) noexcept - : BasicDecimal128(value >= T{0} ? 0 : -1, -static_cast(value)) {} + : BasicDecimal128(value >= T{0} ? 0 : -1, static_cast(value)) {} ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7915: ARROW-9671: [C++] Fix a bug in BasicDecimal128 constructor that interprets uint64_t integers with highest bit set as negative.
github-actions[bot] commented on pull request #7915: URL: https://github.com/apache/arrow/pull/7915#issuecomment-670721481 https://issues.apache.org/jira/browse/ARROW-9671 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on pull request #7798: ARROW-9523 [Rust] Improve filter kernel performance
wesm commented on pull request #7798: URL: https://github.com/apache/arrow/pull/7798#issuecomment-670723322 This PR makes me think that at some point (when someone gets really motivated), it would be interesting to implement a non-language-dependent benchmark harness for certain operations (such that can be represented using e.g. a standard protobuf serialized operation/expression) so that we can get apples-to-apples numbers for certain operations across implementations. It would be interesting to know e.g. how the Rust implementation of filtering compares with the C++ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on issue #7910: OSError: Could not connect to socket
wesm commented on issue #7910: URL: https://github.com/apache/arrow/issues/7910#issuecomment-670723509 Could you open a JIRA issue and describe in more detail the scenario that leads to this error? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] andygrove commented on pull request #7798: ARROW-9523 [Rust] Improve filter kernel performance
andygrove commented on pull request #7798: URL: https://github.com/apache/arrow/pull/7798#issuecomment-670725220 I completely agree. I just happen to have a protobuf definition for plans and expressions as well as serde code for Java and Rust that I would be happy to contribute. We would need to implement for C++. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] eerhardt commented on pull request #6122: ARROW-7040: [C#] - System.Memory Span.CopyTo - Crashes
eerhardt commented on pull request #6122: URL: https://github.com/apache/arrow/pull/6122#issuecomment-670770541 Just a follow up here - a fix has been checked into .NET Framework 4.7 and 4.8 that makes this change no longer necessary. According to https://github.com/dotnet/runtime/issues/13701#issuecomment-670722879 the fix is scheduled to be deployed in September. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fredgan commented on a change in pull request #7824: ARROW-9205: [Documentation] Fix typos
fredgan commented on a change in pull request #7824: URL: https://github.com/apache/arrow/pull/7824#discussion_r467334354 ## File path: docs/source/format/Columnar.rst ## @@ -76,7 +76,7 @@ concepts, here is a small glossary to help disambiguate. * **Nested type**: a data type whose full structure depends on one or more other child types. Two fully-specified nested types are equal if and only if their child types are equal. For example, ``List`` - is distinct from ``List`` iff U and V are different types. + is distinct from ``List`` if U and V are different types. Review comment: @kou Well, I'll give up on the modification of `iff` for now. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wesm commented on a change in pull request #7816: ARROW-9528: [Python] Honor tzinfo when converting from datetime
wesm commented on a change in pull request #7816: URL: https://github.com/apache/arrow/pull/7816#discussion_r467330463 ## File path: ci/scripts/integration_spark.sh ## @@ -22,6 +22,9 @@ source_dir=${1} spark_dir=${2} spark_version=${SPARK_VERSION:-master} +# Use old behavior that always dropped tiemzones. +export ARROW_NO_TZ=1 Review comment: Nit: might be better to call this something more explicit like `PYARROW_PANDAS_DROP_NESTED_TZ` (since it only impacts tz-aware datetimes inside e.g. structs, right?) ## File path: cpp/src/arrow/python/arrow_to_pandas.cc ## @@ -642,24 +641,27 @@ inline Status ConvertStruct(const PandasOptions& options, const ChunkedArray& da std::vector fields_data(num_fields); OwnedRef dict_item; - // XXX(wesm): In ARROW-7723, we found as a result of ARROW-3789 that second + // In ARROW-7723, we found as a result of ARROW-3789 that second // through microsecond resolution tz-aware timestamps were being promoted to // use the DATETIME_NANO_TZ conversion path, yielding a datetime64[ns] NumPy // array in this function. PyArray_GETITEM returns datetime.datetime for // units second through microsecond but PyLong for nanosecond (because - // datetime.datetime does not support nanoseconds). We inserted this hack to - // preserve the <= 0.15.1 behavior until a better solution can be devised + // datetime.datetime does not support nanoseconds). + // We force the object conversion to preserve the value of the timezone. 
Review comment: Might be worth stating again for emphasis that nanoseconds are returned as integers inside structs (I got confused myself when I saw it because I forgot about this) ## File path: cpp/src/arrow/python/python_to_arrow.cc ## @@ -191,10 +191,11 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { + static inline Result FromPython(PyObject* obj, TimeUnit::type unit, + bool /*ignore_timezone*/) { Review comment: This is a bit ugly, could we pass an options struct instead? ## File path: cpp/src/arrow/python/python_to_arrow.cc ## @@ -1328,6 +1357,10 @@ Status ConvertPySequence(PyObject* sequence_source, PyObject* mask, if (options.type == nullptr) { RETURN_NOT_OK(InferArrowType(seq, mask, options.from_pandas, &real_type)); +if (options.ignore_timezone && real_type->id() == Type::TIMESTAMP) { + const auto& ts_type = checked_cast(*real_type); Review comment: be consistent about const? ## File path: cpp/src/arrow/python/arrow_to_pandas.cc ## @@ -951,8 +953,21 @@ struct ObjectWriterVisitor { template enable_if_timestamp Visit(const Type& type) { const TimeUnit::type unit = type.unit(); -auto WrapValue = [unit](typename Type::c_type value, PyObject** out) { +OwnedRef tzinfo; +if (!type.timezone().empty() && !options.ignore_timezone) { + ARROW_ASSIGN_OR_RAISE(tzinfo, internal::StringToTzinfo(type.timezone())); + RETURN_IF_PYERROR(); +} +auto WrapValue = [&](typename Type::c_type value, PyObject** out) { Review comment: I'd suggest declaring two lambdas, one that is the original WrapValue and another that adds the tzinfo to the result of WrapValue ## File path: cpp/src/arrow/python/common.h ## @@ -137,6 +137,11 @@ class ARROW_PYTHON_EXPORT OwnedRef { OwnedRef(OwnedRef&& other) : OwnedRef(other.detach()) {} explicit OwnedRef(PyObject* obj) : obj_(obj) {} + OwnedRef& operator=(PyObject* obj) { +obj_ = obj; +return *this; + } Review comment: Is this definitely needed? 
I think it's cleaner to use the explicit reset/detach methods ## File path: cpp/src/arrow/python/datetime.h ## @@ -157,6 +157,22 @@ inline int64_t PyDelta_to_ns(PyDateTime_Delta* pytimedelta) { return PyDelta_to_us(pytimedelta) * 1000; } +ARROW_PYTHON_EXPORT Review comment: Do these need to be exported? ## File path: cpp/src/arrow/compute/kernels/scalar_string.cc ## @@ -861,10 +861,10 @@ void AddBinaryLength(FunctionRegistry* registry) { applicator::ScalarUnaryNotNull::Exec; ArrayKernelExec exec_offset_64 = applicator::ScalarUnaryNotNull::Exec; - for (const auto input_type : {binary(), utf8()}) { + for (const auto& input_type : {binary(), utf8()}) { Review comment: I agree it's good to fix but not required for the pR ## File path: python/pyarrow/tests/test_pandas.py ## @@ -3325,13 +3325,35 @@ def test_cast_timestamp_unit(): assert result.equals(expected) -def test_struct_with_timestamp_tz(): +def test_nested_with_timestamp_tz_round_trip(): +ts = pd.Timestamp.now() +ts_dt = ts.to_pydatetime() +arr = pa.array([ts_dt], type=pa.timestamp('us', tz='America/New_York')) +struct = pa.StructArray.from_arrays([arr, ar
[GitHub] [arrow] kiszk commented on pull request #7915: ARROW-9671: [C++] Fix a bug in BasicDecimal128 constructor that interprets uint64_t integers with highest bit set as negative.
kiszk commented on pull request #7915: URL: https://github.com/apache/arrow/pull/7915#issuecomment-670799483 good catch! This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] offthewall123 commented on a change in pull request #7815: ARROW-9536: [Java] Miss parameters in PlasmaOutOfMemoryException.java
offthewall123 commented on a change in pull request #7815: URL: https://github.com/apache/arrow/pull/7815#discussion_r467342682 ## File path: java/plasma/src/test/java/org/apache/arrow/plasma/PlasmaClientTest.java ## @@ -277,6 +278,20 @@ public void doByteBufferTest() { client.release(id); } + public void doPlasmaOutOfMemoryExceptionTest() { +System.out.println("Start PlasmaOutOfMemoryException test."); +PlasmaClient client = (PlasmaClient) pLink; +byte[] objectId = new byte[20]; +Arrays.fill(objectId, (byte) 1); +try { + ByteBuffer byteBuffer = client.create(objectId, 2, null); + client.seal(objectId); +} catch (PlasmaOutOfMemoryException e) { + System.out.println(String.format("Expected PlasmaOutOfMemoryException: %s", e)); Review comment: > shouldn't this have an assert? Hi @emkornfield , an Assert statement added. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on pull request #7815: ARROW-9536: [Java] Miss parameters in PlasmaOutOfMemoryException.java
emkornfield commented on pull request #7815: URL: https://github.com/apache/arrow/pull/7815#issuecomment-670812386 +1, thanks. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield closed pull request #7815: ARROW-9536: [Java] Miss parameters in PlasmaOutOfMemoryException.java
emkornfield closed pull request #7815: URL: https://github.com/apache/arrow/pull/7815 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7885: ARROW-9640: [C++][Gandiva] Implement round() for integers and long integers
emkornfield commented on a change in pull request #7885: URL: https://github.com/apache/arrow/pull/7885#discussion_r467352487 ## File path: cpp/src/gandiva/precompiled/extended_math_ops_test.cc ## @@ -87,6 +87,19 @@ TEST(TestExtendedMathOps, TestLogWithBase) { EXPECT_EQ(context1.has_error(), false); } +TEST(TestExtendedMathOps, TestRound) { + EXPECT_EQ(round_int32(7589, -1), 7590); Review comment: what I mean is I don't think anything here tests for the addition of .5; I think there are currently other values for the rounding addition that would allow these tests to pass. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on pull request #7915: ARROW-9671: [C++] Fix a bug in BasicDecimal128 constructor that interprets uint64_t integers with highest bit set as negative.
emkornfield commented on pull request #7915: URL: https://github.com/apache/arrow/pull/7915#issuecomment-670813455 It seems travis is having issues today. I'm going to go ahead and merge this. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield closed pull request #7915: ARROW-9671: [C++] Fix a bug in BasicDecimal128 constructor that interprets uint64_t integers with highest bit set as negative.
emkornfield closed pull request #7915: URL: https://github.com/apache/arrow/pull/7915 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7816: ARROW-9528: [Python] Honor tzinfo when converting from datetime
emkornfield commented on a change in pull request #7816: URL: https://github.com/apache/arrow/pull/7816#discussion_r467353205 ## File path: ci/scripts/integration_spark.sh ## @@ -22,6 +22,9 @@ source_dir=${1} spark_dir=${2} spark_version=${SPARK_VERSION:-master} +# Use old behavior that always dropped tiemzones. +export ARROW_NO_TZ=1 Review comment: no, this also affects inference when importing non-naive datetimes and when things like timestamp_as_object are used in to pandas. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7816: ARROW-9528: [Python] Honor tzinfo when converting from datetime
emkornfield commented on a change in pull request #7816: URL: https://github.com/apache/arrow/pull/7816#discussion_r467353299 ## File path: cpp/src/arrow/python/python_to_arrow.cc ## @@ -191,10 +191,11 @@ struct ValueConverter { template <> struct ValueConverter { - static inline Result FromPython(PyObject* obj, TimeUnit::type unit) { + static inline Result FromPython(PyObject* obj, TimeUnit::type unit, + bool /*ignore_timezone*/) { Review comment: My thought here was that this parameter will only be needed until the next release, so it would be better to add it here and then remove it without the need to introduce options. We can introduce options though if you feel strongly. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7248: ARROW-8402: [Java] Support ValidateFull methods in Java
emkornfield commented on a change in pull request #7248: URL: https://github.com/apache/arrow/pull/7248#discussion_r467354214 ## File path: java/vector/src/main/java/org/apache/arrow/vector/validate/ValidateVectorBufferVisitor.java ## @@ -0,0 +1,224 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.validate; + +import static org.apache.arrow.vector.validate.ValidateUtility.validateOrThrow; + +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.vector.BaseFixedWidthVector; +import org.apache.arrow.vector.BaseLargeVariableWidthVector; +import org.apache.arrow.vector.BaseVariableWidthVector; +import org.apache.arrow.vector.BitVector; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.NullVector; +import org.apache.arrow.vector.TypeLayout; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.compare.VectorVisitor; +import org.apache.arrow.vector.complex.DenseUnionVector; +import org.apache.arrow.vector.complex.FixedSizeListVector; +import org.apache.arrow.vector.complex.LargeListVector; +import org.apache.arrow.vector.complex.ListVector; +import org.apache.arrow.vector.complex.NonNullableStructVector; +import org.apache.arrow.vector.complex.UnionVector; +import org.apache.arrow.vector.types.pojo.ArrowType; + +/** + * Visitor to validate vector buffers. 
+ */ +public class ValidateVectorBufferVisitor implements VectorVisitor { + + private void validateVectorCommon(ValueVector vector) { +ArrowType arrowType = vector.getField().getType(); +validateOrThrow(vector.getValueCount() >= 0, "vector valueCount is negative"); + +if (vector instanceof FieldVector) { + FieldVector fieldVector = (FieldVector) vector; + int typeBufferCount = TypeLayout.getTypeBufferCount(arrowType); + validateOrThrow(fieldVector.getFieldBuffers().size() == typeBufferCount, + String.format("Expected %s buffers in vector of type %s, got %s", + typeBufferCount, vector.getField().getType().toString(), fieldVector.getFieldBuffers().size())); +} + } + + private void validateValidityBuffer(ValueVector vector, int valueCount) { +ArrowBuf validityBuffer = vector.getValidityBuffer(); +validateOrThrow(validityBuffer != null, "The validity buffer is null."); +validateOrThrow(validityBuffer.capacity() * 8 >= valueCount, "No enough capacity for the validity buffer."); + } + + private void validateOffsetBuffer(ValueVector vector, long minCapacity) { +ArrowBuf offsetBuffer = vector.getOffsetBuffer(); +validateOrThrow(offsetBuffer != null, "The offset buffer is null."); +validateOrThrow(offsetBuffer.capacity() >= minCapacity, "No enough capacity for the offset buffer."); + } + + private void validateFixedWidthDataBuffer(ValueVector vector, int valueCount, int bitWidth) { +ArrowBuf dataBuffer = vector.getDataBuffer(); +validateOrThrow(dataBuffer != null, "The fixed width data buffer is null."); +validateOrThrow((long) bitWidth * valueCount <= dataBuffer.capacity() * 8L, +"No enough capacity for fixed width data buffer"); Review comment: for most error messages it would be useful to include actual numbers. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7326: ARROW-9010: [Java] Framework and interface changes for RecordBatch IPC buffer compression
emkornfield commented on a change in pull request #7326: URL: https://github.com/apache/arrow/pull/7326#discussion_r467356218 ## File path: java/vector/src/main/java/org/apache/arrow/vector/compression/CompressionUtil.java ## @@ -0,0 +1,60 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compression; + +import org.apache.arrow.flatbuf.BodyCompressionMethod; +import org.apache.arrow.flatbuf.CompressionType; +import org.apache.arrow.vector.ipc.message.ArrowBodyCompression; + +/** + * Utilities for data compression/decompression. + */ +public class CompressionUtil { + + private CompressionUtil() { + } + + /** + * Creates the {@link ArrowBodyCompression} object, given the {@link CompressionCodec}. + * The implementation of this method should depend on the values of {@link CompressionType#names}. + */ + public static ArrowBodyCompression createBodyCompression(CompressionCodec codec) { +switch (codec.getCodecName()) { + case "default": +return DefaultCompressionCodec.DEFAULT_BODY_COMPRESSION; + case "LZ4_FRAME": +return new ArrowBodyCompression((byte) 0, BodyCompressionMethod.BUFFER); Review comment: isn't there an enum of 0 and 1 below? 
## File path: java/vector/src/main/java/org/apache/arrow/vector/compression/DefaultCompressionCodec.java ## @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compression; + +import org.apache.arrow.flatbuf.BodyCompressionMethod; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.message.ArrowBodyCompression; + +/** + * The default compression codec that does no compression. + */ +public class DefaultCompressionCodec implements CompressionCodec { Review comment: nit: I think NoCompressionCodec or something similar to indicate that no compression/decompression happens might this easier to understand for future readers. ## File path: java/vector/src/main/java/org/apache/arrow/vector/VectorUnloader.java ## @@ -76,6 +97,10 @@ private void appendNodes(FieldVector vector, List nodes, Listhttp://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.vector.compression; + +import org.apache.arrow.flatbuf.BodyCompressionMethod; +import org.apache.arrow.memory.ArrowBuf; +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.message.ArrowBodyCompression; + +/** + * The default compression codec that does no compression. + */ +public class DefaultCompressionCodec implements CompressionCodec { + + public static final DefaultCompressionCodec INSTANCE = new DefaultCompressionCodec(); + + public static final byte COMPRESSION_TYPE = -1; + + public static final ArrowBodyCompression DEFAULT_BODY_COMPRESSION = + new ArrowBodyCompression(COMPRESSION_TYPE, BodyCompressionMethod.BUFFER); + + private DefaultCompressionCodec() { + } + + @Override + public ArrowBuf compress(BufferAllocator allocator, ArrowBuf unCompressedBuffer) { Review
[GitHub] [arrow] emkornfield commented on a change in pull request #7817: ARROW-9377: [Java] Support unsigned dictionary indices
emkornfield commented on a change in pull request #7817: URL: https://github.com/apache/arrow/pull/7817#discussion_r467357784 ## File path: java/vector/src/test/java/org/apache/arrow/vector/ipc/TestUIntDictionaryRoundTrip.java ## @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.channels.Channels; +import java.util.Arrays; +import java.util.Collection; +import java.util.Map; +import java.util.function.ToIntBiFunction; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +/** + * Test the round-trip of dictionary encoding, + * with unsigned integer as indices. 
+ */ +@RunWith(Parameterized.class) +public class TestUIntDictionaryRoundTrip { + + private final boolean streamMode; + + public TestUIntDictionaryRoundTrip(boolean streamMode) { +this.streamMode = streamMode; + } + + private BufferAllocator allocator; + + private VarCharVector dictionaryVector; + + private DictionaryProvider.MapDictionaryProvider dictionaryProvider; + + @Before + public void init() { +allocator = new RootAllocator(Long.MAX_VALUE); +dictionaryVector = new VarCharVector("dict vector", allocator); +setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); + +dictionaryProvider = new DictionaryProvider.MapDictionaryProvider(); + } + + @After + public void terminate() throws Exception { +dictionaryVector.close(); +allocator.close(); + } + + private byte[] writeData(FieldVector encodedVector) throws IOException { +ByteArrayOutputStream out = new ByteArrayOutputStream(); +VectorSchemaRoot root = +new VectorSchemaRoot( +Arrays.asList(encodedVector.getField()), Arrays.asList(encodedVector), encodedVector.getValueCount()); +try (ArrowWriter writer = streamMode ? +new ArrowStreamWriter(root, dictionaryProvider, out) : +new ArrowFileWriter(root, dictionaryProvider, Channels.newChannel(out))) { + writer.start(); + writer.writeBatch(); + writer.end(); + + return out.toByteArray(); +} + } + + private void readData( + byte[] data, + Field expectedField, + ToIntBiFunction valGetter, + long dictionaryID) throws IOException { +try (ArrowReader reader = streamMode ? + new ArrowStreamReader(new ByteArrayInputStream(data), allocator) : + new ArrowFileReader(new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(data)), allocator)) { + + // verify schema + Schema readSchema = reader.getVectorSchemaRoot().getSchema(); + assertEquals(1, readSchema.getFields().size()); + assertEquals(expectedField, readSchema.getFields().get(0)); + + // verify vector schema root + assertTrue(reader.loadNextBatch()); +
[GitHub] [arrow] emkornfield commented on a change in pull request #7817: ARROW-9377: [Java] Support unsigned dictionary indices
emkornfield commented on a change in pull request #7817: URL: https://github.com/apache/arrow/pull/7817#discussion_r467357953 ## File path: java/vector/src/test/java/org/apache/arrow/vector/ipc/TestUIntDictionaryRoundTrip.java ## @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.vector.ipc; + +import static org.apache.arrow.vector.testing.ValueVectorDataPopulator.setVector; +import static org.junit.Assert.assertArrayEquals; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertTrue; + +import java.io.ByteArrayInputStream; +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.channels.Channels; +import java.util.Arrays; +import java.util.Collection; +import java.util.Map; +import java.util.function.ToIntBiFunction; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.FieldVector; +import org.apache.arrow.vector.UInt1Vector; +import org.apache.arrow.vector.UInt2Vector; +import org.apache.arrow.vector.UInt4Vector; +import org.apache.arrow.vector.UInt8Vector; +import org.apache.arrow.vector.ValueVector; +import org.apache.arrow.vector.VarCharVector; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.dictionary.Dictionary; +import org.apache.arrow.vector.dictionary.DictionaryProvider; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.DictionaryEncoding; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; +import org.junit.After; +import org.junit.Before; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +/** + * Test the round-trip of dictionary encoding, + * with unsigned integer as indices. 
+ */ +@RunWith(Parameterized.class) +public class TestUIntDictionaryRoundTrip { + + private final boolean streamMode; + + public TestUIntDictionaryRoundTrip(boolean streamMode) { +this.streamMode = streamMode; + } + + private BufferAllocator allocator; + + private VarCharVector dictionaryVector; + + private DictionaryProvider.MapDictionaryProvider dictionaryProvider; + + @Before + public void init() { +allocator = new RootAllocator(Long.MAX_VALUE); +dictionaryVector = new VarCharVector("dict vector", allocator); +setVector(dictionaryVector, "0", "1", "2", "3", "4", "5", "6", "7", "8", "9"); Review comment: it might be nice to verify dictionary for at least uint1 work for vectors of length > 127 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wqc200 opened a new pull request #7916: ARROW-9673: [Rust] [DataFusion] Add a param "dialect" for DFParser::parse_sql
wqc200 opened a new pull request #7916: URL: https://github.com/apache/arrow/pull/7916 I use the function 'DFParser::parse_sql' in an outside app with the MySQL dialect, but parse_sql uses the generic dialect by default; we need to add a parameter so that we can pass in the dialect. Thanks. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7916: ARROW-9673: [Rust] [DataFusion] Add a param "dialect" for DFParser::parse_sql
github-actions[bot] commented on pull request #7916: URL: https://github.com/apache/arrow/pull/7916#issuecomment-670828493 https://issues.apache.org/jira/browse/ARROW-9673 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org