[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414990082 ## File path: cpp/src/jni/dataset/jni_wrapper.cpp ## @@ -0,0 +1,577 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrent_map.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/compare.h" +#include "jni/dataset/Types.pb.h" + +#include "org_apache_arrow_dataset_jni_JniWrapper.h" +#include "org_apache_arrow_dataset_file_JniWrapper.h" + +static jclass illegal_access_exception_class; +static jclass illegal_argument_exception_class; +static jclass runtime_exception_class; + +static jclass record_batch_handle_class; +static jclass record_batch_handle_field_class; +static jclass record_batch_handle_buffer_class; + +static jmethodID record_batch_handle_constructor; +static jmethodID record_batch_handle_field_constructor; +static jmethodID record_batch_handle_buffer_constructor; + +static jint JNI_VERSION = JNI_VERSION_1_6; + +using arrow::jni::ConcurrentMap; + +static ConcurrentMap> dataset_factory_holder_; +static ConcurrentMap> dataset_holder_; +static ConcurrentMap> scan_task_holder_; +static ConcurrentMap> scanner_holder_; +static ConcurrentMap> iterator_holder_; +static ConcurrentMap> buffer_holder_; + +#define JNI_ASSIGN_OR_THROW_NAME(x, y) ARROW_CONCAT(x, y) + +#define JNI_ASSIGN_OR_THROW_IMPL(t, lhs, rexpr) \ + auto t = (rexpr); \ + if (!t.status().ok()) { \ +env->ThrowNew(runtime_exception_class, t.status().message().c_str()); \ + } \ + lhs = std::move(t).ValueOrDie(); + +#define JNI_ASSIGN_OR_THROW(lhs, rexpr) \ + JNI_ASSIGN_OR_THROW_IMPL(JNI_ASSIGN_OR_THROW_NAME(_tmp_var, __COUNTER__), lhs, rexpr) + +#define JNI_ASSERT_OK_OR_THROW(expr) \ + do {\ +auto _res = (expr); \ +arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ +if (!_st.ok()) { \ + env->ThrowNew(runtime_exception_class, _st.message().c_str()); \ +} \ + } while (false); + +jclass CreateGlobalClassReference(JNIEnv* env, const char* class_name) { + jclass local_class = env->FindClass(class_name); + jclass global_class = 
(jclass)env->NewGlobalRef(local_class); + env->DeleteLocalRef(local_class); + return global_class; +} + +jmethodID GetMethodID(JNIEnv* env, jclass this_class, const char* name, const char* sig) { + jmethodID ret = env->GetMethodID(this_class, name, sig); + if (ret == nullptr) { +std::string error_message = "Unable to find method " + std::string(name) + +" within signature" + std::string(sig); +env->ThrowNew(illegal_access_exception_class, error_message.c_str()); + } + return ret; +} + +jint JNI_OnLoad(JavaVM* vm, void* reserved) { + JNIEnv* env; + if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION) != JNI_OK) { +return JNI_ERR; + } + + illegal_access_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/IllegalAccessException;"); + illegal_argument_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/IllegalArgumentException;"); + runtime_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/RuntimeException;"); + + record_batch_handle_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/jni/NativeRecordBatchHandle;"); + record_batch_handle_field_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/jni/NativeRecordBatchHandle$Field;"); + record_batch_handle_buffer_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414989937 ## File path: cpp/src/jni/dataset/jni_wrapper.cpp ## @@ -0,0 +1,577 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrent_map.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/compare.h" +#include "jni/dataset/Types.pb.h" + +#include "org_apache_arrow_dataset_jni_JniWrapper.h" +#include "org_apache_arrow_dataset_file_JniWrapper.h" + +static jclass illegal_access_exception_class; +static jclass illegal_argument_exception_class; +static jclass runtime_exception_class; + +static jclass record_batch_handle_class; +static jclass record_batch_handle_field_class; +static jclass record_batch_handle_buffer_class; + +static jmethodID record_batch_handle_constructor; +static jmethodID record_batch_handle_field_constructor; +static jmethodID record_batch_handle_buffer_constructor; + +static jint JNI_VERSION = JNI_VERSION_1_6; + +using arrow::jni::ConcurrentMap; + +static ConcurrentMap> dataset_factory_holder_; +static ConcurrentMap> dataset_holder_; +static ConcurrentMap> scan_task_holder_; +static ConcurrentMap> scanner_holder_; +static ConcurrentMap> iterator_holder_; +static ConcurrentMap> buffer_holder_; + +#define JNI_ASSIGN_OR_THROW_NAME(x, y) ARROW_CONCAT(x, y) + +#define JNI_ASSIGN_OR_THROW_IMPL(t, lhs, rexpr) \ + auto t = (rexpr); \ + if (!t.status().ok()) { \ +env->ThrowNew(runtime_exception_class, t.status().message().c_str()); \ + } \ + lhs = std::move(t).ValueOrDie(); + +#define JNI_ASSIGN_OR_THROW(lhs, rexpr) \ + JNI_ASSIGN_OR_THROW_IMPL(JNI_ASSIGN_OR_THROW_NAME(_tmp_var, __COUNTER__), lhs, rexpr) + +#define JNI_ASSERT_OK_OR_THROW(expr) \ + do {\ +auto _res = (expr); \ +arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ +if (!_st.ok()) { \ + env->ThrowNew(runtime_exception_class, _st.message().c_str()); \ +} \ + } while (false); + +jclass CreateGlobalClassReference(JNIEnv* env, const char* class_name) { + jclass local_class = env->FindClass(class_name); + jclass global_class = 
(jclass)env->NewGlobalRef(local_class); + env->DeleteLocalRef(local_class); + return global_class; +} + +jmethodID GetMethodID(JNIEnv* env, jclass this_class, const char* name, const char* sig) { + jmethodID ret = env->GetMethodID(this_class, name, sig); + if (ret == nullptr) { +std::string error_message = "Unable to find method " + std::string(name) + +" within signature" + std::string(sig); +env->ThrowNew(illegal_access_exception_class, error_message.c_str()); + } + return ret; +} + +jint JNI_OnLoad(JavaVM* vm, void* reserved) { + JNIEnv* env; + if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION) != JNI_OK) { +return JNI_ERR; + } + + illegal_access_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/IllegalAccessException;"); + illegal_argument_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/IllegalArgumentException;"); + runtime_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/RuntimeException;"); + + record_batch_handle_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/jni/NativeRecordBatchHandle;"); + record_batch_handle_field_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/jni/NativeRecordBatchHandle$Field;"); + record_batch_handle_buffer_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414989258 ## File path: java/dataset/src/test/java/org/apache/arrow/dataset/jni/NativeDatasetTest.java ## @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.arrow.dataset.jni; + +import java.io.File; +import java.util.Iterator; +import java.util.List; +import java.util.Spliterator; +import java.util.Spliterators; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; + +import org.apache.arrow.dataset.DatasetTypes; +import org.apache.arrow.dataset.file.FileFormat; +import org.apache.arrow.dataset.file.FileSystem; +import org.apache.arrow.dataset.file.SingleFileDatasetFactory; +import org.apache.arrow.dataset.filter.Filter; +import org.apache.arrow.dataset.filter.FilterImpl; +import org.apache.arrow.dataset.scanner.ScanOptions; +import org.apache.arrow.dataset.scanner.ScanTask; +import org.apache.arrow.dataset.scanner.Scanner; +import org.apache.arrow.dataset.source.Dataset; +import org.apache.arrow.dataset.source.DatasetFactory; +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.VectorSchemaRoot; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Assert; +import org.junit.Ignore; +import org.junit.Test; + +public class NativeDatasetTest { + + private String sampleParquetPath() { +return NativeDatasetTest.class.getResource(File.separator + "userdata.parquet").getPath(); Review comment: please don't check the binary file into this repo. It would be better to generate a file on the fly, so readers of the test can understand what data exists there. If a standalone parquet file is really needed [arrow-testing](https://github.com/apache/arrow-testing) is the place to put it This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414988831 ## File path: java/pom.xml ## @@ -369,24 +369,24 @@ org.apache.maven.plugins maven-compiler-plugin 3.6.2 - - - -XDcompilePolicy=simple - -Xplugin:ErrorProne - - - -com.google.errorprone -error_prone_core -2.3.3 - - -org.immutables -value -2.8.2 - - - + Review comment: revert this? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414988755 ## File path: java/dataset/src/test/java/org/apache/arrow/util/SchemaUtilsTest.java ## @@ -0,0 +1,50 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.util; + +import static java.util.Arrays.asList; + +import java.io.IOException; + +import org.apache.arrow.memory.RootAllocator; +import org.apache.arrow.vector.types.pojo.ArrowType; +import org.apache.arrow.vector.types.pojo.Field; +import org.apache.arrow.vector.types.pojo.FieldType; +import org.apache.arrow.vector.types.pojo.Schema; +import org.junit.Assert; +import org.junit.Test; + +public class SchemaUtilsTest { + + private static Field field(String name, boolean nullable, ArrowType type, Field... 
children) { +return new Field(name, new FieldType(nullable, type, null, null), asList(children)); + } + + @Test + public void testSerializationAndDeserialization() throws IOException { +Schema schema = new Schema(asList( +field("a", false, new ArrowType.Null()), +field("b", true, new ArrowType.Utf8()), +field("c", true, new ArrowType.Binary())) +); + +byte[] serialized = SchemaUtils.get().serialize(schema); +Schema deserialized = SchemaUtils.get().deserialize(serialized, new RootAllocator(Long.MAX_VALUE)); +Assert.assertEquals(schema, deserialized); Review comment: generally, I think we static import assertEquals. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414988474 ## File path: java/dataset/src/main/java/org/apache/arrow/util/SchemaUtils.java ## @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.channels.Channels; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ReadChannel; +import org.apache.arrow.vector.ipc.WriteChannel; +import org.apache.arrow.vector.ipc.message.MessageChannelReader; +import org.apache.arrow.vector.ipc.message.MessageResult; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; + +/** + * Schema utility class including serialization and deserialization. + */ +public class SchemaUtils { + private static final SchemaUtils INSTANCE = new SchemaUtils(); + + public static SchemaUtils get() { +return INSTANCE; + } + + private SchemaUtils() { + + } + + /** + * Deserialize Arrow schema from byte array. 
+ */ + public Schema deserialize(byte[] bytes, BufferAllocator allocator) throws IOException { Review comment: why not make these static methods? instead of using a singleton pattern? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414988443 ## File path: java/dataset/src/main/java/org/apache/arrow/util/SchemaUtils.java ## @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.util; + +import java.io.ByteArrayOutputStream; +import java.io.IOException; +import java.nio.channels.Channels; + +import org.apache.arrow.memory.BufferAllocator; +import org.apache.arrow.vector.ipc.ReadChannel; +import org.apache.arrow.vector.ipc.WriteChannel; +import org.apache.arrow.vector.ipc.message.MessageChannelReader; +import org.apache.arrow.vector.ipc.message.MessageResult; +import org.apache.arrow.vector.ipc.message.MessageSerializer; +import org.apache.arrow.vector.types.pojo.Schema; +import org.apache.arrow.vector.util.ByteArrayReadableSeekableByteChannel; + +/** + * Schema utility class including serialization and deserialization. + */ +public class SchemaUtils { Review comment: this probably belongs in the vector package. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414988086 ## File path: java/dataset/src/main/java/org/apache/arrow/memory/NativeUnderlingMemory.java ## @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.memory; + +import org.apache.arrow.dataset.jni.JniWrapper; + +/** + * AllocationManager implementation for Native allocated memory. + */ +public class NativeUnderlingMemory extends AllocationManager { Review comment: this should probably be in a different common package if we want to wrap the C++ allocator. It isn't clear to me that we should do this because we lose java native heap caps IIUC. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414987860 ## File path: java/dataset/src/main/java/org/apache/arrow/dataset/source/DatasetFactory.java ## @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.dataset.source; + +import org.apache.arrow.vector.types.pojo.Schema; + +/** + * DataSourceDiscovery provides a way to inspect a DataSource potential + * schema before materializing it. Thus, the user can peek the schema for + * data sources and decide on a unified schema. + */ +public interface DatasetFactory { + + Schema inspect(); + Review comment: javadoc. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414987685 ## File path: java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanTask.java ## @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.dataset.scanner; + +import java.util.Iterator; + +import org.apache.arrow.vector.VectorSchemaRoot; + +/** + * Read record batches from a range of a single data fragment. A + * ScanTask is meant to be a unit of work to be dispatched. The implementation + * must be thread and concurrent safe. + */ +public interface ScanTask { + + /** + * Creates and returns a {@link Itr} instance. + */ + Itr scan(); + + /** + * The iterator implementation for {@link VectorSchemaRoot}s. + */ + interface Itr extends Iterator, AutoCloseable { Review comment: can we call this "Iterator"? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414987799 ## File path: java/dataset/src/main/java/org/apache/arrow/dataset/scanner/ScanTask.java ## @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.dataset.scanner; + +import java.util.Iterator; + +import org.apache.arrow.vector.VectorSchemaRoot; + +/** + * Read record batches from a range of a single data fragment. A + * ScanTask is meant to be a unit of work to be dispatched. The implementation + * must be thread and concurrent safe. + */ +public interface ScanTask { + + /** + * Creates and returns a {@link Itr} instance. + */ + Itr scan(); + + /** + * The iterator implementation for {@link VectorSchemaRoot}s. + */ + interface Itr extends Iterator, AutoCloseable { +// FIXME VectorSchemaRoot is not actually something ITERABLE. Using a reader convention instead. Review comment: Fix this? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414987154 ## File path: java/dataset/src/main/java/org/apache/arrow/dataset/jni/JniWrapper.java ## @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.dataset.jni; + +/** + * JNI wrapper for Datasets API's native implementation. + */ +public class JniWrapper { + + private static final JniWrapper INSTANCE = new JniWrapper(); + + public static JniWrapper get() { +return INSTANCE; + } + + private JniWrapper() { +JniLoader.get().ensureLoaded(); + } + + public native void closeDatasetFactory(long datasetFactoryId); Review comment: these methods need javadoc. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414986958 ## File path: java/dataset/src/main/java/org/apache/arrow/dataset/filter/Filter.java ## @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.dataset.filter; + +// todo filter tree implementation Review comment: leave this out if it isn't implemented. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414986841 ## File path: java/dataset/src/main/java/org/apache/arrow/dataset/file/JniWrapper.java ## @@ -0,0 +1,39 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + *http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.arrow.dataset.file; + +import org.apache.arrow.dataset.jni.JniLoader; + +/** + * JniWrapper for filesystem based {@link org.apache.arrow.dataset.source.Dataset} implementations. + */ +public class JniWrapper { + Review comment: private constructor? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414986335 ## File path: cpp/src/jni/dataset/proto/Types.proto ## @@ -0,0 +1,149 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +syntax = "proto2"; +package types; + +option java_package = "org.apache.arrow.dataset"; Review comment: why is this file being introduced? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414985230 ## File path: cpp/src/jni/dataset/jni_wrapper.cpp ## @@ -0,0 +1,577 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrent_map.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/compare.h" +#include "jni/dataset/Types.pb.h" + +#include "org_apache_arrow_dataset_jni_JniWrapper.h" +#include "org_apache_arrow_dataset_file_JniWrapper.h" + +static jclass illegal_access_exception_class; +static jclass illegal_argument_exception_class; +static jclass runtime_exception_class; + +static jclass record_batch_handle_class; +static jclass record_batch_handle_field_class; +static jclass record_batch_handle_buffer_class; + +static jmethodID record_batch_handle_constructor; +static jmethodID record_batch_handle_field_constructor; +static jmethodID record_batch_handle_buffer_constructor; + +static jint JNI_VERSION = JNI_VERSION_1_6; + +using arrow::jni::ConcurrentMap; + +static ConcurrentMap> dataset_factory_holder_; Review comment: this still strikes me as a bad pattern. https://stackoverflow.com/questions/26375215/c-shared-ptr-and-java-native-object-ownership/26377515 seems better. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414984900 ## File path: cpp/src/jni/dataset/jni_wrapper.cpp ## @@ -0,0 +1,577 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrent_map.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/compare.h" +#include "jni/dataset/Types.pb.h" + +#include "org_apache_arrow_dataset_jni_JniWrapper.h" +#include "org_apache_arrow_dataset_file_JniWrapper.h" + +static jclass illegal_access_exception_class; +static jclass illegal_argument_exception_class; +static jclass runtime_exception_class; + +static jclass record_batch_handle_class; +static jclass record_batch_handle_field_class; +static jclass record_batch_handle_buffer_class; + +static jmethodID record_batch_handle_constructor; +static jmethodID record_batch_handle_field_constructor; +static jmethodID record_batch_handle_buffer_constructor; + +static jint JNI_VERSION = JNI_VERSION_1_6; + +using arrow::jni::ConcurrentMap; + +static ConcurrentMap> dataset_factory_holder_; +static ConcurrentMap> dataset_holder_; +static ConcurrentMap> scan_task_holder_; +static ConcurrentMap> scanner_holder_; +static ConcurrentMap> iterator_holder_; +static ConcurrentMap> buffer_holder_; + +#define JNI_ASSIGN_OR_THROW_NAME(x, y) ARROW_CONCAT(x, y) + +#define JNI_ASSIGN_OR_THROW_IMPL(t, lhs, rexpr) \ + auto t = (rexpr); \ + if (!t.status().ok()) { \ +env->ThrowNew(runtime_exception_class, t.status().message().c_str()); \ + } \ + lhs = std::move(t).ValueOrDie(); + +#define JNI_ASSIGN_OR_THROW(lhs, rexpr) \ + JNI_ASSIGN_OR_THROW_IMPL(JNI_ASSIGN_OR_THROW_NAME(_tmp_var, __COUNTER__), lhs, rexpr) + +#define JNI_ASSERT_OK_OR_THROW(expr) \ + do {\ +auto _res = (expr); \ +arrow::Status _st = ::arrow::internal::GenericToStatus(_res); \ +if (!_st.ok()) { \ + env->ThrowNew(runtime_exception_class, _st.message().c_str()); \ +} \ + } while (false); + +jclass CreateGlobalClassReference(JNIEnv* env, const char* class_name) { + jclass local_class = env->FindClass(class_name); + jclass global_class = 
(jclass)env->NewGlobalRef(local_class); + env->DeleteLocalRef(local_class); + return global_class; +} + +jmethodID GetMethodID(JNIEnv* env, jclass this_class, const char* name, const char* sig) { + jmethodID ret = env->GetMethodID(this_class, name, sig); + if (ret == nullptr) { +std::string error_message = "Unable to find method " + std::string(name) + +" within signature" + std::string(sig); +env->ThrowNew(illegal_access_exception_class, error_message.c_str()); + } + return ret; +} + +jint JNI_OnLoad(JavaVM* vm, void* reserved) { + JNIEnv* env; + if (vm->GetEnv(reinterpret_cast(&env), JNI_VERSION) != JNI_OK) { +return JNI_ERR; + } + + illegal_access_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/IllegalAccessException;"); + illegal_argument_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/IllegalArgumentException;"); + runtime_exception_class = + CreateGlobalClassReference(env, "Ljava/lang/RuntimeException;"); + + record_batch_handle_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/jni/NativeRecordBatchHandle;"); + record_batch_handle_field_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/jni/NativeRecordBatchHandle$Field;"); + record_batch_handle_buffer_class = + CreateGlobalClassReference(env, "Lorg/apache/arrow/dataset/
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414984380 ## File path: cpp/src/jni/dataset/jni_wrapper.cpp ## @@ -0,0 +1,577 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "concurrent_map.h" +#include "arrow/compute/kernel.h" +#include "arrow/compute/kernels/cast.h" +#include "arrow/compute/kernels/compare.h" +#include "jni/dataset/Types.pb.h" + +#include "org_apache_arrow_dataset_jni_JniWrapper.h" +#include "org_apache_arrow_dataset_file_JniWrapper.h" + +static jclass illegal_access_exception_class; +static jclass illegal_argument_exception_class; +static jclass runtime_exception_class; + +static jclass record_batch_handle_class; +static jclass record_batch_handle_field_class; +static jclass record_batch_handle_buffer_class; + +static jmethodID record_batch_handle_constructor; +static jmethodID record_batch_handle_field_constructor; +static jmethodID record_batch_handle_buffer_constructor; + +static jint JNI_VERSION = JNI_VERSION_1_6; + +using arrow::jni::ConcurrentMap; + +static ConcurrentMap> dataset_factory_holder_; +static ConcurrentMap> dataset_holder_; +static ConcurrentMap> scan_task_holder_; +static ConcurrentMap> scanner_holder_; +static ConcurrentMap> iterator_holder_; +static ConcurrentMap> buffer_holder_; + +#define JNI_ASSIGN_OR_THROW_NAME(x, y) ARROW_CONCAT(x, y) Review comment: I don't think these macros are necessary. you should be able to use a normal (templated) function since you throw an exception instead of return. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
emkornfield commented on a change in pull request #7030: URL: https://github.com/apache/arrow/pull/7030#discussion_r414983499 ## File path: cpp/src/jni/dataset/concurrent_map.h ## @@ -0,0 +1,81 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + */ + +#ifndef JNI_ID_TO_MODULE_MAP_H +#define JNI_ID_TO_MODULE_MAP_H + +#include +#include +#include +#include + +#include "jni.h" +#include "arrow/util/macros.h" + +namespace arrow { Review comment: Please do not copy and paste this file, factor it out to a common place that can be used by all JNI implementations. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #7025: ARROW-2260: [C++][Plasma] Use Gflags for command-line parsing
emkornfield commented on a change in pull request #7025: URL: https://github.com/apache/arrow/pull/7025#discussion_r414975521 ## File path: cpp/src/plasma/store.cc ## @@ -1207,65 +1211,77 @@ void StartServer(char* socket_name, std::string plasma_directory, bool hugepages g_runner->Start(socket_name, plasma_directory, hugepages_enabled, external_store); } +void UsageError(const char* error_msg, int exit_code=1) { + std::cerr << gflags::ProgramInvocationShortName() << ": " << error_msg << std::endl; + exit(exit_code); +} + } // namespace plasma +#ifdef __linux__ +#define SHM_DEFAULT_PATH "/dev/shm" +#else +#define SHM_DEFAULT_PATH "/tmp" +#endif + +// Command-line flags. +DEFINE_string(d, SHM_DEFAULT_PATH, "directory where to create the memory-backed file"); +DEFINE_string(e, "", "endpoint for external storage service, where objects " +"evicted from Plasma store can be written to, optional"); +DEFINE_bool(h, false, "whether to enable hugepage support"); +DEFINE_string(s, "", "socket name where the Plasma store will listen for requests, required"); +DEFINE_string(m, "", "amount of memory in bytes to use for Plasma store, required"); + int main(int argc, char* argv[]) { ArrowLog::StartArrowLog(argv[0], ArrowLogLevel::ARROW_INFO); ArrowLog::InstallFailureSignalHandler(); + + gflags::SetUsageMessage("Shared-memory server for Arrow data.\nUsage: "); + gflags::SetVersionString("TODO"); + char* socket_name = nullptr; // Directory where plasma memory mapped files are stored. 
std::string plasma_directory; std::string external_store_endpoint; bool hugepages_enabled = false; int64_t system_memory = -1; - int c; - while ((c = getopt(argc, argv, "s:m:d:e:h")) != -1) { -switch (c) { - case 'd': -plasma_directory = std::string(optarg); -break; - case 'e': -external_store_endpoint = std::string(optarg); -break; - case 'h': -hugepages_enabled = true; -break; - case 's': -socket_name = optarg; -break; - case 'm': { -char extra; -int scanned = sscanf(optarg, "%" SCNd64 "%c", &system_memory, &extra); -ARROW_CHECK(scanned == 1); -// Set system memory capacity - plasma::PlasmaAllocator::SetFootprintLimit(static_cast(system_memory)); -ARROW_LOG(INFO) << "Allowing the Plasma store to use up to " -<< static_cast(system_memory) / 10 -<< "GB of memory."; -break; - } - default: -exit(-1); -} + + gflags::ParseCommandLineFlags(&argc, &argv, true); Review comment: please comment literal parameters /\*parameter_name=\*/ ## File path: cpp/src/plasma/store.cc ## @@ -1304,8 +1320,9 @@ int main(int argc, char* argv[]) { plasma::ExternalStores::ExtractStoreName(external_store_endpoint, &name)); external_store = plasma::ExternalStores::GetStore(name); if (external_store == nullptr) { - ARROW_LOG(FATAL) << "No such external store \"" << name << "\""; - return -1; + std::ostringstream error_msg; + error_msg << "no such external store \"" << name << "\""; + plasma::UsageError(error_msg.str().c_str(), -1); Review comment: please comment literals with parameter name /*parameter_name=*/ ## File path: cpp/src/plasma/store.cc ## @@ -1207,65 +1211,77 @@ void StartServer(char* socket_name, std::string plasma_directory, bool hugepages g_runner->Start(socket_name, plasma_directory, hugepages_enabled, external_store); } +void UsageError(const char* error_msg, int exit_code=1) { + std::cerr << gflags::ProgramInvocationShortName() << ": " << error_msg << std::endl; Review comment: might be worth a comment on why this doesn't use ARROW_LOG(FATAL) ## File path: 
cpp/src/plasma/store.cc ## @@ -1207,65 +1211,77 @@ void StartServer(char* socket_name, std::string plasma_directory, bool hugepages g_runner->Start(socket_name, plasma_directory, hugepages_enabled, external_store); } +void UsageError(const char* error_msg, int exit_code=1) { + std::cerr << gflags::ProgramInvocationShortName() << ": " << error_msg << std::endl; + exit(exit_code); +} + } // namespace plasma +#ifdef __linux__ +#define SHM_DEFAULT_PATH "/dev/shm" +#else +#define SHM_DEFAULT_PATH "/tmp" +#endif + +// Command-line flags. +DEFINE_string(d, SHM_DEFAULT_PATH, "directory where to create the memory-backed file"); +DEFINE_string(e, "", "endpoint for external storage service, where objects " +"evicted from Plasma store can be written to, optional"); +DEFINE_bool(h, false, "whether to enable hugepage support"); +DEFINE_string(s, "", "socket name where the Plasma store will listen for requests, required"); +DEFINE_string(m, "", "amount of memory in bytes to use for Plasma store, required"); + int main(int argc, char* argv[]) { ArrowLog::StartArrowLog(argv[0], Arro
[GitHub] [arrow] emkornfield commented on a change in pull request #7025: ARROW-2260: [C++][Plasma] Use Gflags for command-line parsing
emkornfield commented on a change in pull request #7025: URL: https://github.com/apache/arrow/pull/7025#discussion_r414974208 ## File path: cpp/src/plasma/store.cc ## @@ -1207,65 +1211,77 @@ void StartServer(char* socket_name, std::string plasma_directory, bool hugepages g_runner->Start(socket_name, plasma_directory, hugepages_enabled, external_store); } +void UsageError(const char* error_msg, int exit_code=1) { + std::cerr << gflags::ProgramInvocationShortName() << ": " << error_msg << std::endl; + exit(exit_code); +} + } // namespace plasma +#ifdef __linux__ +#define SHM_DEFAULT_PATH "/dev/shm" +#else +#define SHM_DEFAULT_PATH "/tmp" +#endif + +// Command-line flags. +DEFINE_string(d, SHM_DEFAULT_PATH, "directory where to create the memory-backed file"); +DEFINE_string(e, "", "endpoint for external storage service, where objects " +"evicted from Plasma store can be written to, optional"); +DEFINE_bool(h, false, "whether to enable hugepage support"); +DEFINE_string(s, "", "socket name where the Plasma store will listen for requests, required"); +DEFINE_string(m, "", "amount of memory in bytes to use for Plasma store, required"); + int main(int argc, char* argv[]) { ArrowLog::StartArrowLog(argv[0], ArrowLogLevel::ARROW_INFO); ArrowLog::InstallFailureSignalHandler(); + + gflags::SetUsageMessage("Shared-memory server for Arrow data.\nUsage: "); + gflags::SetVersionString("TODO"); Review comment: it seems like this could be done by adding a a define using ARROW_VERSION cmake variable. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on pull request #7025: ARROW-2260: [C++][Plasma] Use Gflags for command-line parsing
emkornfield commented on pull request #7025: URL: https://github.com/apache/arrow/pull/7025#issuecomment-619317374 @chrish42 Thank you for the PR, I'll take a look now. Note it looks like lint is failing due to formatting issues. You need to run "make format" or "ninja format" to run clang-format (we are currently using version 8 I believe). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] emkornfield commented on a change in pull request #6912: ARROW-8020: [Java] Implement vector validate functionality
emkornfield commented on a change in pull request #6912: URL: https://github.com/apache/arrow/pull/6912#discussion_r414972130 ## File path: java/vector/src/main/java/org/apache/arrow/vector/ValueVector.java ## @@ -283,4 +283,10 @@ * @return the name of the vector. */ String getName(); + + /** + * Validate the vector, will throw exception if validate fail. + */ + void validate(); Review comment: Instead of adding this directly to ValueVector, can we make a static utility method that does validation and excepts a ValueVector? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wjones1 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader
wjones1 commented on a change in pull request #6979: URL: https://github.com/apache/arrow/pull/6979#discussion_r414951095 ## File path: python/pyarrow/_parquet.pyx ## @@ -1083,6 +1084,50 @@ cdef class ParquetReader: def set_use_threads(self, bint use_threads): self.reader.get().set_use_threads(use_threads) +def iter_batches(self, int64_t batch_size, row_groups, column_indices=None, Review comment: I have those parameters because I was following the pattern set by the [`read_row_group()` and `read_row_groups()`](https://github.com/apache/arrow/blob/66a18e14f64359b73bd8ed5625c3092cd369af44/python/pyarrow/_parquet.pyx#L1131-L1136) methods in the same file, which both have a `use_threads` parameter. Same with the `read_all()` method. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wjones1 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader
wjones1 commented on a change in pull request #6979: URL: https://github.com/apache/arrow/pull/6979#discussion_r414948745 ## File path: cpp/src/parquet/arrow/reader.cc ## @@ -260,12 +260,28 @@ class FileReaderImpl : public FileReader { Status GetRecordBatchReader(const std::vector& row_group_indices, const std::vector& column_indices, - std::unique_ptr* out) override; + std::unique_ptr* out, Review comment: Thanks for the feedback. I think I understand now the pattern going on with `set_use_threads()`. It's a method on the reader that [calls the method of the same name it's `ArrowReaderProperties`](https://github.com/apache/arrow/blob/master/cpp/src/parquet/arrow/reader.cc#L279-L281). I will implement the same thing for [the `set_batch_size()`](https://github.com/apache/arrow/blob/master/cpp/src/parquet/properties.h#L564) method. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] wjones1 commented on a change in pull request #6979: ARROW-7800 [Python] implement iter_batches() method for ParquetFile and ParquetReader
wjones1 commented on a change in pull request #6979: URL: https://github.com/apache/arrow/pull/6979#discussion_r414948789 ## File path: python/pyarrow/_parquet.pxd ## @@ -334,7 +334,7 @@ cdef extern from "parquet/api/reader.h" namespace "parquet" nogil: ArrowReaderProperties() void set_read_dictionary(int column_index, c_bool read_dict) c_bool read_dictionary() -void set_batch_size() +void set_batch_size(int batch_size) Review comment: 👍 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] zgramana commented on pull request #7032: ARROW-6603, ARROW-5708, ARROW-5634: [C#] Adds ArrayBuilder API to support writing null values + BooleanArray null support
zgramana commented on pull request #7032: URL: https://github.com/apache/arrow/pull/7032#issuecomment-619296269 @eerhardt apologies for loading up three issues in the title, but I kept finding older issues in the Apache Jira backlog that were addressed here as well, and so erred on the side of too much rather than too little. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] markhildreth edited a comment on pull request #7024: ARROW-8573: [Rust] Upgrade Rust to 1.44 nightly
markhildreth edited a comment on pull request #7024: URL: https://github.com/apache/arrow/pull/7024#issuecomment-619273713 @andygrove I think there is going to be more to this than this PR. The "nightly-2019-11-14" string [can be found in a few places](https://github.com/apache/arrow/search?q=nightly-2019-11-14&unscoped_q=nightly-2019-11-14), including some of the build scripts (notice that the workflows run for this PR are still called "2019-11-14"). @nevi-me @paddyhoran The problem with this error seems to be that the change was only made in rust-toolchain. As a result, the GHA workflows will still build a Rust environment with a nightly-2019-11-14 toolchain. When cargo attempts to build, it will notice that the toolchain is out of date with the `rust-toolchain` file, and attempt to pull the new toolchain on-the-fly. However, it doesn't necessarily pull `rustfmt` when this occurs. ``` installed toolchains stable-x86_64-pc-windows-msvc nightly-2019-11-14-x86_64-pc-windows-msvc active toolchain nightly-2019-11-14-x86_64-pc-windows-msvc (directory override for '\\?\D:\a\arrow\arrow') rustc 1.41.0-nightly (ded5ee001 2019-11-13) + mkdir -p /d/a/arrow/arrow/build/rust + pushd /d/a/arrow/arrow/rust /d/a/arrow/arrow/rust /d/a/arrow/arrow + RUSTFLAGS='-D warnings' + cargo build --all-targets info: syncing channel updates for 'nightly-2020-04-22-x86_64-pc-windows-msvc' info: latest update on 2020-04-22, rust version 1.44.0-nightly (45d050cde 2020-04-21) info: downloading component 'cargo' info: downloading component 'rust-std' info: downloading component 'rustc' info: installing component 'cargo' info: installing component 'rust-std' info: installing component 'rustc' ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mayuropensource edited a comment on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics
mayuropensource edited a comment on pull request #7022: URL: https://github.com/apache/arrow/pull/7022#issuecomment-619276182 // SOME_S3_DATA_URI should point to a file (over http) that is ~500 MiB. // TTFB_sec is the time-to-first-byte in seconds as measured by curl // Bandwidth_bytes_per_sec is download bandwidth (in bytes per second) as measured by curl // Once SOME_S3_DATA_URI is created, it can be queried on a regular basis for measurements. `S3_DATA_URI="SOME_S3_DATA_URI"; ` `curl --negotiate -u: -o /dev/null -w "connect_time_sec=%{time_connect} total_time_sec=%{time_total} data_size_bytes=%{size_download} TTFB_sec=%{time_starttransfer} Bandwidth_bytes_per_sec=%{speed_download}\n" $S3_DATA_URI` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mayuropensource commented on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics
mayuropensource commented on pull request #7022: URL: https://github.com/apache/arrow/pull/7022#issuecomment-619276182 // SOME_S3_DATA_URI should point to a file (over http) that is ~500 MiB. // TTFB_sec is the time-to-first-byte in seconds as measured by curl // Bandwidth_bytes_per_sec is download bandwidth (in bytes per second) as measured by curl `S3_DATA_URI="SOME_S3_DATA_URI"; ` `curl --negotiate -u: -o /dev/null -w "connect_time_sec=%{time_connect} total_time_sec=%{time_total} data_size_bytes=%{size_download} TTFB_sec=%{time_starttransfer} Bandwidth_bytes_per_sec=%{speed_download}\n" $S3_DATA_URI` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] markhildreth commented on pull request #7024: ARROW-8573: [Rust] Upgrade Rust to 1.44 nightly
markhildreth commented on pull request #7024: URL: https://github.com/apache/arrow/pull/7024#issuecomment-619273713 @andygrove I think there is going to be more to this than this PR. The "nightly-2019-11-14" string [can be found in a few places](https://github.com/apache/arrow/search?q=nightly-2019-11-14&unscoped_q=nightly-2019-11-14), including some of the build scripts (notice that the workflows run for this PR are still called "2019-11-14"). @nevi-me @paddyhoran The problem with this error seems to be that the change was only made in rust-toolchain. As a result, the GHA workflows will still build a Windows environment with a nightly-2019-11-14 toolchain. When cargo attempts to build, it will notice that the toolchain is out of date with the `rust-toolchain` file, and attempt to pull the new toolchain on-the-fly. However, it doesn't pull `rust fmt` when this occurs. ``` installed toolchains stable-x86_64-pc-windows-msvc nightly-2019-11-14-x86_64-pc-windows-msvc active toolchain nightly-2019-11-14-x86_64-pc-windows-msvc (directory override for '\\?\D:\a\arrow\arrow') rustc 1.41.0-nightly (ded5ee001 2019-11-13) + mkdir -p /d/a/arrow/arrow/build/rust + pushd /d/a/arrow/arrow/rust /d/a/arrow/arrow/rust /d/a/arrow/arrow + RUSTFLAGS='-D warnings' + cargo build --all-targets info: syncing channel updates for 'nightly-2020-04-22-x86_64-pc-windows-msvc' info: latest update on 2020-04-22, rust version 1.44.0-nightly (45d050cde 2020-04-21) info: downloading component 'cargo' info: downloading component 'rust-std' info: downloading component 'rustc' info: installing component 'cargo' info: installing component 'rust-std' info: installing component 'rustc' ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] velvia commented on a change in pull request #4815: [DISCUSS] Add strawman proposal for sparseness and data integrity
velvia commented on a change in pull request #4815: URL: https://github.com/apache/arrow/pull/4815#discussion_r414877852 ## File path: format/Message.fbs ## @@ -21,10 +21,69 @@ include "Tensor.fbs"; namespace org.apache.arrow.flatbuf; +/// -- +/// Buffer encoding schemes. +/// --- + +/// Encoding for buffers representing integer as offsets from a reference value. +/// This encoding uses less bits then the logical type indicates. +/// It saves space when all values in the buffer can be represented with a +/// small bit width (e.g. if all values in an int64 column are between -128 +/// and 127, then a bit-width of 8 can be be used) offset from the +/// reference value. +table FrameOfReferenceIntEncoding { + /// The value that all values in the buffer are relative to. + reference_value: long = 0; Review comment: Depending on the size of your batch, a sloped representation would result in far smaller arrays, since the delta from a slope is typically much smaller and can fit into less bits. You can still do O(1) access to any element, just compute ax + b etc. Assuming the data is actually increasing, of course - otherwise step wise is fine. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on a change in pull request #7036: ARROW-8591: [Rust] Reverse lookup for a key in DictionaryArray
nevi-me commented on a change in pull request #7036: URL: https://github.com/apache/arrow/pull/7036#discussion_r414875192 ## File path: rust/arrow/src/array/array.rs ## @@ -1786,38 +1786,34 @@ impl From<(Vec<(Field, ArrayRef)>, Buffer, usize)> for StructArray { /// This is mostly used to represent strings or a limited set of primitive types as integers, /// for example when doing NLP analysis or representing chromosomes by name. /// -/// Example with nullable data: +/// Example **with nullable** data: /// /// ``` -/// use arrow::array::DictionaryArray; -/// use arrow::datatypes::Int8Type; -/// let test = vec!["a", "a", "b", "c"]; -/// let array : DictionaryArray = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect(); -/// assert_eq!(array.keys().collect::>>(), vec![Some(0), Some(0), None, Some(1)]); +/// use arrow::array::DictionaryArray; +/// use arrow::datatypes::Int8Type; +/// let test = vec!["a", "a", "b", "c"]; +/// let array : DictionaryArray = test.iter().map(|&x| if x == "b" {None} else {Some(x)}).collect(); +/// assert_eq!(array.keys().collect::>>(), vec![Some(0), Some(0), None, Some(1)]); /// ``` /// -/// Example without nullable data: +/// Example **without nullable** data: /// /// ``` -/// -/// use arrow::array::DictionaryArray; -/// use arrow::datatypes::Int8Type; -/// let test = vec!["a", "a", "b", "c"]; -/// let array : DictionaryArray = test.into_iter().collect(); -/// assert_eq!(array.keys().collect::>>(), vec![Some(0), Some(0), Some(1), Some(2)]); +/// use arrow::array::DictionaryArray; +/// use arrow::datatypes::Int8Type; +/// let test = vec!["a", "a", "b", "c"]; +/// let array : DictionaryArray = test.into_iter().collect(); +/// assert_eq!(array.keys().collect::>>(), vec![Some(0), Some(0), Some(1), Some(2)]); /// ``` pub struct DictionaryArray { -// Array of keys, much like a PrimitiveArray +/// Array of keys, much like a PrimitiveArray data: ArrayDataRef, -// Pointer to the key values. +/// Pointer to the key values. 
raw_values: RawPtrBox, -// Array of any values. +/// Array of any values. values: ArrayRef, - Review comment: `is_ordered` is needed for IPC purposes, so we shouldn't remove it for compatibility purposes This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7037: ARROW-6718: [Rust] Remove packed_simd
github-actions[bot] commented on pull request #7037: URL: https://github.com/apache/arrow/pull/7037#issuecomment-619242490 https://issues.apache.org/jira/browse/ARROW-6718 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me opened a new pull request #7037: ARROW-6718: [Rust] Remove packed_simd
nevi-me opened a new pull request #7037: URL: https://github.com/apache/arrow/pull/7037 This removes the dependency on packed_simd. I initially thought that boolean kernels were slower than with explicit SIMD, but this was a false alarm as the benchmarks weren't comparing SIMD vs non-SIMD. While doing this, I noticed that the `divide` kernel appears to be unsound, as it checks if a null is 0 (which can be true when the default data behind the bitmask is 0). Below is the performance comparison: From 0.15.0 to 0.16.0 ```rust Running target/release/deps/arithmetic_kernels-ba6ab3db9f184b40 add 512 time: [15.565 us 15.623 us 15.694 us] change: [-66.359% -66.104% -65.861%] (p = 0.00 < 0.05) Performance has improved. Found 4 outliers among 100 measurements (4.00%) 3 (3.00%) high mild 1 (1.00%) high severe add 512 simdtime: [14.939 us 16.768 us 18.744 us] change: [+1.4006% +6.0795% +11.131%] (p = 0.02 < 0.05) Performance has regressed. Found 9 outliers among 100 measurements (9.00%) 1 (1.00%) high mild 8 (8.00%) high severe subtract 512time: [15.659 us 15.727 us 15.799 us] change: [-65.994% -65.847% -65.690%] (p = 0.00 < 0.05) Performance has improved. subtract 512 simd time: [14.003 us 14.119 us 14.284 us] change: [-4.9276% -3.2446% -1.6479%] (p = 0.00 < 0.05) Performance has improved. Found 5 outliers among 100 measurements (5.00%) 2 (2.00%) high mild 3 (3.00%) high severe multiply 512time: [15.774 us 15.824 us 15.875 us] change: [-65.694% -65.526% -65.352%] (p = 0.00 < 0.05) Performance has improved. Found 1 outliers among 100 measurements (1.00%) 1 (1.00%) high mild multiply 512 simd time: [14.299 us 14.458 us 14.681 us] change: [-0.9771% -0.0444% +0.9882%] (p = 0.93 > 0.05) No change in performance detected. Found 5 outliers among 100 measurements (5.00%) 2 (2.00%) high mild 3 (3.00%) high severe divide 512 time: [16.690 us 16.731 us 16.774 us] change: [-65.394% -65.012% -64.701%] (p = 0.00 < 0.05) Performance has improved. 
Found 3 outliers among 100 measurements (3.00%) 3 (3.00%) high mild divide 512 simd time: [16.098 us 16.147 us 16.202 us] change: [-3.6005% -2.6939% -1.9439%] (p = 0.00 < 0.05) Performance has improved. Found 3 outliers among 100 measurements (3.00%) 2 (2.00%) high mild 1 (1.00%) high severe sum 512 no simd time: [7.1888 us 7.2836 us 7.4349 us] change: [-1.2993% -0.2501% +1.2521%] (p = 0.73 > 0.05) No change in performance detected. Found 6 outliers among 100 measurements (6.00%) 3 (3.00%) high mild 3 (3.00%) high severe limit 512, 256 no simd time: [6.8801 us 6.9257 us 6.9792 us] change: [-3.8909% -2.7450% -1.6742%] (p = 0.00 < 0.05) Performance has improved. Found 9 outliers among 100 measurements (9.00%) 2 (2.00%) high mild 7 (7.00%) high severe limit 512, 512 no simd time: [6.8552 us 6.9007 us 6.9552 us] change: [-36.783% -31.294% -25.031%] (p = 0.00 < 0.05) Performance has improved. Found 7 outliers among 100 measurements (7.00%) 4 (4.00%) high mild 3 (3.00%) high severe Running target/release/deps/array_from_vec-9acb1269f64e7733 array_from_vec 128 time: [418.62 ns 423.66 ns 430.30 ns] change: [-2.2547% -0.6846% +0.9641%] (p = 0.48 > 0.05) No change in performance detected. Found 8 outliers among 100 measurements (8.00%) 3 (3.00%) high mild 5 (5.00%) high severe array_from_vec 256 time: [659.91 ns 661.68 ns 663.62 ns] change: [-2.1474% -1.6329% -1.1820%] (p = 0.00 < 0.05) Performance has improved. Found 3 outliers among 100 measurements (3.00%) 2 (2.00%) high mild 1 (1.00%) high severe array_from_vec 512 time: [1.1200 us 1.1244 us 1.1304 us] change: [-2.9911% -2.3466% -1.7654%] (p = 0.00 < 0.05) Performance has improved. Found 2 outliers among 100 measurements (2.00%) 1 (1.00%) high mild 1 (1.00%) high severe Running target/release/deps/boolean_kernels-25e7d12fe4fd7f63 and
[GitHub] [arrow] bkietz commented on a change in pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat
bkietz commented on a change in pull request #7033: URL: https://github.com/apache/arrow/pull/7033#discussion_r414856596 ## File path: cpp/src/arrow/dataset/file_csv.h ## @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/result.h" + +namespace arrow { +namespace dataset { + +/// \brief A FileFormat implementation that reads from and writes to Csv files +class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { + public: + csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); Review comment: fmt->parse_options.delimiter = '\t' For example This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on a change in pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat
nealrichardson commented on a change in pull request #7033: URL: https://github.com/apache/arrow/pull/7033#discussion_r414852277 ## File path: cpp/src/arrow/dataset/file_csv.h ## @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include +#include + +#include "arrow/csv/options.h" +#include "arrow/dataset/file_base.h" +#include "arrow/dataset/type_fwd.h" +#include "arrow/dataset/visibility.h" +#include "arrow/result.h" + +namespace arrow { +namespace dataset { + +/// \brief A FileFormat implementation that reads from and writes to Csv files +class ARROW_DS_EXPORT CsvFileFormat : public FileFormat { + public: + csv::ParseOptions parse_options = csv::ParseOptions::Defaults(); Review comment: How do I provide these options? `auto fmt = std::make_shared();` and then like `fmt.parse_options = something`? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7036: ARROW-8591: [Rust] Reverse lookup for a key in DictionaryArray
github-actions[bot] commented on pull request #7036: URL: https://github.com/apache/arrow/pull/7036#issuecomment-619219685 https://issues.apache.org/jira/browse/ARROW-8591 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7035: ARROW-8590: [Rust] Use arrow crate pretty util in DataFusion
github-actions[bot] commented on pull request #7035: URL: https://github.com/apache/arrow/pull/7035#issuecomment-619219686 https://issues.apache.org/jira/browse/ARROW-8590 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] vertexclique opened a new pull request #7036: ARROW-8591: [Rust] Reverse lookup for a key in DictionaryArray
vertexclique opened a new pull request #7036: URL: https://github.com/apache/arrow/pull/7036 This PR enables reverse lookup for an already-built dictionary. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] markhildreth commented on pull request #6972: ARROW-8287: [Rust] Add "pretty" util to help with printing tabular output of RecordBatches
markhildreth commented on pull request #6972: URL: https://github.com/apache/arrow/pull/6972#issuecomment-619215502 Created [follow-up JIRA task](https://issues.apache.org/jira/browse/ARROW-8590). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] markhildreth opened a new pull request #7035: ARROW-8590: [Rust] Use arrow crate pretty util in DataFusion
markhildreth opened a new pull request #7035: URL: https://github.com/apache/arrow/pull/7035 Fixes [ARROW-8590](https://issues.apache.org/jira/browse/ARROW-8590) This builds on #6972, and thus should be merged after that PR is merged. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on pull request #6879: ARROW-8377: [CI][C++][R] Build and run C++ tests on Rtools build
nealrichardson commented on pull request #6879: URL: https://github.com/apache/arrow/pull/6879#issuecomment-619212781 @github-actions rebase This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat
nealrichardson commented on pull request #7033: URL: https://github.com/apache/arrow/pull/7033#issuecomment-619213038 @github-actions rebase This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on issue #7034: R arrow package can't see arrow-cpp installation
nealrichardson commented on issue #7034: URL: https://github.com/apache/arrow/issues/7034#issuecomment-619210756 We don't do any testing on NixOS, so it's not surprising that it doesn't just work. http://arrow.apache.org/docs/r/articles/install.html describes how dependencies are resolved and notes the relevant files inside the `r` directory of apache/arrow. It sounds like http://arrow.apache.org/docs/r/articles/install.html#using-system-libraries is most relevant since you say you already have Arrow C++ installed, so maybe you can set one of those environment variables to help it find them. If you have any fixes to enable it to work on your OS, please [open a JIRA ticket](https://issues.apache.org/jira/browse/ARROW) and I'll be happy to review your PR. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] Zhen-hao opened a new issue #7034: R arrow package can't see arrow-cpp installation
Zhen-hao opened a new issue #7034: URL: https://github.com/apache/arrow/issues/7034 Hi there, this is more a question than a bug request. I am using NixOS 20.03 and couldn't get the arrow library in R to see the arrow C++ library. Even when I install the library from R itself, it can't find the C++ library after restarting. The following is the installation log in R. This problem might be specific to NixOS. But if anyone can point to the source code that checks for the C++ library, I might be able to help to fix this for NixOS. Thanks in advance! ``` > arrow::install_arrow() Installing package into ‘/home/zhen/R/x86_64-pc-linux-gnu-library/3.6’ (as ‘lib’ is unspecified) trying URL 'https://cran.rstudio.com/src/contrib/arrow_0.17.0.tar.gz' Content type 'application/x-gzip' length 242534 bytes (236 KB) == downloaded 236 KB * installing *source* package ‘arrow’ ... ** package ‘arrow’ successfully unpacked and MD5 sums checked ** using staged installation *** No C++ binaries found for nixos-20 *** Successfully retrieved C++ source *** Building C++ libraries cmake arrow ./configure: line 132: cd: libarrow/arrow-0.17.0/lib: No such file or directory - NOTE --- After installation, please run arrow::install_arrow() for help installing required runtime libraries - ** libs /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c array.cpp -o array.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c array_from_vector.cpp -o array_from_vector.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 
-I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c array_to_vector.cpp -o array_to_vector.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c arraydata.cpp -o arraydata.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c arrowExports.cpp -o arrowExports.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c buffer.cpp -o buffer.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c chunkedarray.cpp -o chunkedarray.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c compression.cpp -o compression.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c compute.cpp -o compute.o 
/nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c csv.cpp -o csv.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866r194yxy0mppyix8n-R-3.6.3/lib/R/include" -DNDEBUG -I"/nix/store/58d1yk61akanrlq3pwnbhbb62vrwk6a6-r-Rcpp-1.0.4/library/Rcpp/include" -fpic -g -O2 -c dataset.cpp -o dataset.o /nix/store/x9vxn05bn19wn2mzn6d4h56glbd02pyr-gcc-wrapper-9.3.0/bin/c++ -std=gnu++11 -I"/nix/store/jk3gaiy3g3fs2866
[GitHub] [arrow] andygrove commented on pull request #7018: ARROW-8536: [Rust] [Flight] Check in proto file, conditional build if file exists
andygrove commented on pull request #7018: URL: https://github.com/apache/arrow/pull/7018#issuecomment-619202633 Let's see what others say on this. Personally, I think it would be better for build.rs to automatically prepend the ASF license header because there is the risk of someone making manual changes to the file as part of a PR. I know it is extra work, but it seems safer. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat
github-actions[bot] commented on pull request #7033: URL: https://github.com/apache/arrow/pull/7033#issuecomment-619195315 https://issues.apache.org/jira/browse/ARROW-7759 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] bkietz opened a new pull request #7033: ARROW-7759: [C++][Dataset] Add CsvFileFormat
bkietz opened a new pull request #7033: URL: https://github.com/apache/arrow/pull/7033 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on pull request #7024: ARROW-8573: [Rust] Upgrade Rust to 1.44 nightly
nevi-me commented on pull request #7024: URL: https://github.com/apache/arrow/pull/7024#issuecomment-619189091 @paddyhoran we might have to try a different nightly, as sometimes a day's version might have no rustfmt. The change I made in that PR installs a nightly version, I don't know how to make GHA install a stable version of rustfmt This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics
fsaintjacques commented on pull request #7022: URL: https://github.com/apache/arrow/pull/7022#issuecomment-619186142 I'd say just plain HTTP, as @lidavidm pointed in his comment, this is a network attribute. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] mayuropensource commented on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics
mayuropensource commented on pull request #7022: URL: https://github.com/apache/arrow/pull/7022#issuecomment-619184138 @fsaintjacques, I can try to put together a python script using boto to determine the S3 metrics. Will that work for you? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7032: ARROW-6603: [C#] Adds ArrayBuilder API to support writing null values
github-actions[bot] commented on pull request #7032: URL: https://github.com/apache/arrow/pull/7032#issuecomment-619164076 https://issues.apache.org/jira/browse/ARROW-6603 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] zgramana commented on pull request #6121: ARROW-6603: [C#] - Nullable Array Support
zgramana commented on pull request #6121: URL: https://github.com/apache/arrow/pull/6121#issuecomment-619162558 @eerhardt I've just submitted https://github.com/apache/arrow/pull/7032 for review/discussion This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] zgramana opened a new pull request #7032: ARROW-6603: [C#] Adds ArrayBuilder API to support writing null values
zgramana opened a new pull request #7032: URL: https://github.com/apache/arrow/pull/7032 Takes an alternative approach to completing [ARROW-6603](https://issues.apache.org/jira/browse/ARROW-6603) that is in-line with the current API and with other Arrow implementations. More specifically, this PR finishes the previously stubbed out implementation already in the codebase (e.g. uses the existing `NullBitmapBuffer`). The biggest challenge was finding a mechanism capable of supporting both nullable and non-nullable types with minimal changes to existing code. This was accomplished with the addition of the following builder interface member: ```csharp public interface IArrowArrayBuilder : IArrowArrayBuilder where TArray : IArrowArray where TBuilder : IArrowArrayBuilder { ... TBuilder AppendNull(); ... } ``` The bulk of the implementation work focuses on adding the implementation of `AppendNull` to `PrimitiveArrayBuilder` and `Binary.BuilderBase`, and removing hardcoded `0`'s passed as arguments to `nullCount` in the `ArrayData` constructors. This adds two new tests to `ArrayBuilderTests`: * The first includes a number of `null` scenarios using `TestArrayBuilder(...)`. * The second includes a `StringArray.Builder` scenario with mixed `null` and `string.Empty` values. I implemented it such that `string.Empty` is considered a valid value (so it increments `Offset` without adding any bytes to `ValueBuffer`. When `null` is passed to `Append` it will just invoke `AppendNull` internally instead. And adds two new tests to `ArrowArrayTests` which focus on the behavior of `Slice`: * `SlicePrimitiveArrayWithNulls` * `SliceStringArrayWithNullsAndEmptyStrings` All 162 (existing + new) tests passed locally at the time this was submitted This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7031: ARROW-8587: [C++] Fix linking Flight benchmarks
github-actions[bot] commented on pull request #7031: URL: https://github.com/apache/arrow/pull/7031#issuecomment-619156566 https://issues.apache.org/jira/browse/ARROW-8587 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou opened a new pull request #7031: ARROW-8587: [C++] Fix linking Flight benchmarks
pitrou opened a new pull request #7031: URL: https://github.com/apache/arrow/pull/7031 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sunchao commented on a change in pull request #6935: ARROW-8455: [Rust] Parquet Arrow column read on partially compatible files
sunchao commented on a change in pull request #6935: URL: https://github.com/apache/arrow/pull/6935#discussion_r414738189 ## File path: rust/parquet/src/column/reader.rs ## @@ -190,15 +190,12 @@ impl ColumnReaderImpl { (self.num_buffered_values - self.num_decoded_values) as usize, ); -// Adjust batch size by taking into account how much space is left in -// values slice or levels slices (if available) -adjusted_size = min(adjusted_size, values.len() - values_read); -if let Some(ref levels) = def_levels { -adjusted_size = min(adjusted_size, levels.len() - levels_read); -} -if let Some(ref levels) = rep_levels { -adjusted_size = min(adjusted_size, levels.len() - levels_read); -} +// Adjust batch size by taking into account how much data there +// to read. As batch_size is also smaller than value and level +// slices (if available), this ensures that available space is not +// exceeded. +adjusted_size = min(adjusted_size, batch_size - values_read); Review comment: Yes. I just struggled to come up with a concrete example for this (i.e., with what values for `batch_size`, `def_levels.len()`, `rep_levels.len()` and `values.len()` this will occur). :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
kiszk commented on a change in pull request #7029: URL: https://github.com/apache/arrow/pull/7029#discussion_r414739332 ## File path: cpp/src/arrow/util/spaced.h ## @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/bit_util.h" + +#ifdef ARROW_HAVE_AVX512 +#include +#endif // ARROW_HAVE_AVX512 + +namespace arrow { +namespace util { +namespace internal { + +template +int PutSpacedScalar(const T* values, int num_values, const uint8_t* valid_bits, +int64_t valid_bits_offset, T* output) { + int num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + for (int32_t i = 0; i < num_values; i++) { +if (valid_bits_reader.IsSet()) { + output[num_valid_values++] = values[i]; +} +valid_bits_reader.Next(); + } + return num_valid_values; +} + +template +int DecodeSpacedScalar(T* buffer, int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + const int values_read = num_values - null_count; + + // Depending on the number of nulls, some of the value slots in buffer may + // be uninitialized, and this will cause valgrind warnings / potentially UB + memset(static_cast(buffer + 
values_read), 0, + (num_values - values_read) * sizeof(T)); + + // Add spacing for null entries. As we have filled the buffer from the front, + // we need to add the spacing from the back. + int values_to_move = values_read - 1; + // We stop early on one of two conditions: + // 1. There are no more null values that need spacing. Note we infer this + // backwards, when 'i' is equal to 'values_to_move' it indicates + //all nulls have been consumed. + // 2. There are no more non-null values that need to move which indicates + //all remaining slots are null, so their exact value doesn't matter. + for (int i = num_values - 1; (i > values_to_move) && (values_to_move >= 0); i--) { +if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = buffer[values_to_move]; + values_to_move--; +} + } + return num_values; +} + +#if defined(ARROW_HAVE_AVX512) +template +int PutSpacedAvx512Compress(const T* values, int num_values, const uint8_t* valid_bits, +int64_t valid_bits_offset, T* output) { + assert(sizeof(T) == 4 || sizeof(T) == 8); // Only support epi32 and epi64 Review comment: i see This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sunchao commented on a change in pull request #6935: ARROW-8455: [Rust] Parquet Arrow column read on partially compatible files
sunchao commented on a change in pull request #6935: URL: https://github.com/apache/arrow/pull/6935#discussion_r414738189 ## File path: rust/parquet/src/column/reader.rs ## @@ -190,15 +190,12 @@ impl ColumnReaderImpl { (self.num_buffered_values - self.num_decoded_values) as usize, ); -// Adjust batch size by taking into account how much space is left in -// values slice or levels slices (if available) -adjusted_size = min(adjusted_size, values.len() - values_read); -if let Some(ref levels) = def_levels { -adjusted_size = min(adjusted_size, levels.len() - levels_read); -} -if let Some(ref levels) = rep_levels { -adjusted_size = min(adjusted_size, levels.len() - levels_read); -} +// Adjust batch size by taking into account how much data there +// to read. As batch_size is also smaller than value and level +// slices (if available), this ensures that available space is not +// exceeded. +adjusted_size = min(adjusted_size, batch_size - values_read); Review comment: Yes. I just struggled to come up with a concrete example for this. :) This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] sunchao commented on pull request #6949: ARROW-7681: [Rust] Explicitly seeking a BufReader will discard the internal buffer (2)
sunchao commented on pull request #6949: URL: https://github.com/apache/arrow/pull/6949#issuecomment-619140531 > It's the reader (file handle) that is passed to it that should be thread safe Is [file](https://doc.rust-lang.org/std/fs/struct.File.html) thread-safe? it's not obvious when reading the doc. Plus, here type parameter`R` can be anything that implements `ParquetReader` (with read, seek, length and try_clone capabilities) so you cannot assume all of them guarantees thread-safety. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] BryanCutler commented on a change in pull request #6323: ARROW-7610: [Java] Finish support for 64 bit int allocations
BryanCutler commented on a change in pull request #6323: URL: https://github.com/apache/arrow/pull/6323#discussion_r414695756 ## File path: java/memory/src/main/java/org/apache/arrow/memory/NettyAllocationManager.java ## @@ -17,48 +17,97 @@ package org.apache.arrow.memory; -import org.apache.arrow.memory.util.LargeMemoryUtil; - import io.netty.buffer.PooledByteBufAllocatorL; import io.netty.buffer.UnsafeDirectLittleEndian; +import io.netty.util.internal.PlatformDependent; /** - * The default implementation of AllocationManagerBase. The implementation is responsible for managing when memory + * The default implementation of {@link AllocationManager}. The implementation is responsible for managing when memory * is allocated and returned to the Netty-based PooledByteBufAllocatorL. */ public class NettyAllocationManager extends AllocationManager { public static final Factory FACTORY = new Factory(); + /** + * The default cut-off value for switching allocation strategies. + * If the request size is not greater than the cut-off value, we will allocate memory by + * {@link PooledByteBufAllocatorL} APIs, + * otherwise, we will use {@link PlatformDependent} APIs. 
+ */ + public static final long DEFAULT_ALLOCATION_CUTOFF_VALUE; + + public static final String DEFAULT_ALLOCATION_CUTOFF_NAME = "default.allocation.cutoff.name"; + + static { +long cutOffValue; +try { + cutOffValue = Long.parseLong(System.getProperty(DEFAULT_ALLOCATION_CUTOFF_NAME)); +} catch (Exception e) { + cutOffValue = Integer.MAX_VALUE; +} +DEFAULT_ALLOCATION_CUTOFF_VALUE = cutOffValue; + } + private static final PooledByteBufAllocatorL INNER_ALLOCATOR = new PooledByteBufAllocatorL(); static final UnsafeDirectLittleEndian EMPTY = INNER_ALLOCATOR.empty; static final long CHUNK_SIZE = INNER_ALLOCATOR.getChunkSize(); - private final int allocatedSize; + private final long allocatedSize; private final UnsafeDirectLittleEndian memoryChunk; + private final long allocatedAddress; + + /** + * The cut-off value for switching allocation strategies. + */ + private final long allocationCutOffValue; Review comment: This should be an int because it can't go above `Int.MAX_VALUE` ## File path: java/memory/src/main/java/org/apache/arrow/memory/NettyAllocationManager.java ## @@ -17,48 +17,97 @@ package org.apache.arrow.memory; -import org.apache.arrow.memory.util.LargeMemoryUtil; - import io.netty.buffer.PooledByteBufAllocatorL; import io.netty.buffer.UnsafeDirectLittleEndian; +import io.netty.util.internal.PlatformDependent; /** - * The default implementation of AllocationManagerBase. The implementation is responsible for managing when memory + * The default implementation of {@link AllocationManager}. The implementation is responsible for managing when memory * is allocated and returned to the Netty-based PooledByteBufAllocatorL. */ public class NettyAllocationManager extends AllocationManager { public static final Factory FACTORY = new Factory(); + /** + * The default cut-off value for switching allocation strategies. 
+ * If the request size is not greater than the cut-off value, we will allocate memory by + * {@link PooledByteBufAllocatorL} APIs, + * otherwise, we will use {@link PlatformDependent} APIs. + */ + public static final long DEFAULT_ALLOCATION_CUTOFF_VALUE; + + public static final String DEFAULT_ALLOCATION_CUTOFF_NAME = "default.allocation.cutoff.name"; + + static { +long cutOffValue; +try { + cutOffValue = Long.parseLong(System.getProperty(DEFAULT_ALLOCATION_CUTOFF_NAME)); +} catch (Exception e) { + cutOffValue = Integer.MAX_VALUE; +} +DEFAULT_ALLOCATION_CUTOFF_VALUE = cutOffValue; + } + private static final PooledByteBufAllocatorL INNER_ALLOCATOR = new PooledByteBufAllocatorL(); static final UnsafeDirectLittleEndian EMPTY = INNER_ALLOCATOR.empty; static final long CHUNK_SIZE = INNER_ALLOCATOR.getChunkSize(); - private final int allocatedSize; + private final long allocatedSize; private final UnsafeDirectLittleEndian memoryChunk; + private final long allocatedAddress; + + /** + * The cut-off value for switching allocation strategies. + */ + private final long allocationCutOffValue; - NettyAllocationManager(BaseAllocator accountingAllocator, int requestedSize) { + NettyAllocationManager(BaseAllocator accountingAllocator, long requestedSize, long allocationCutOffValue) { super(accountingAllocator); -this.memoryChunk = INNER_ALLOCATOR.allocate(requestedSize); -this.allocatedSize = memoryChunk.capacity(); +if (allocationCutOffValue > Integer.MAX_VALUE) { + throw new IllegalArgumentException("The cut-off value cannot be larger than Integer.MAX_VALUE"); +} +this.allocationCutOffValue = allocationCutOffValue; + +if (requestedSize > allocationCutOffValue) { + this.memoryChunk =
[GitHub] [arrow] nealrichardson commented on a change in pull request #7026: ARROW-7391: [C++][Dataset] Remove Expression subclasses from bindings
nealrichardson commented on a change in pull request #7026: URL: https://github.com/apache/arrow/pull/7026#discussion_r414680234 ## File path: r/src/expression.cpp ## @@ -21,99 +21,97 @@ // [[arrow::export]] std::shared_ptr dataset___expr__field_ref(std::string name) { - return std::make_shared(std::move(name)); + return ds::field_ref(std::move(name)); } // [[arrow::export]] -std::shared_ptr dataset___expr__equal( +std::shared_ptr dataset___expr__equal( const std::shared_ptr& lhs, const std::shared_ptr& rhs) { return ds::equal(lhs, rhs); } // [[arrow::export]] -std::shared_ptr dataset___expr__not_equal( +std::shared_ptr dataset___expr__not_equal( const std::shared_ptr& lhs, const std::shared_ptr& rhs) { return ds::not_equal(lhs, rhs); } // [[arrow::export]] -std::shared_ptr dataset___expr__greater( +std::shared_ptr dataset___expr__greater( const std::shared_ptr& lhs, const std::shared_ptr& rhs) { return ds::greater(lhs, rhs); } // [[arrow::export]] -std::shared_ptr dataset___expr__greater_equal( +std::shared_ptr dataset___expr__greater_equal( const std::shared_ptr& lhs, const std::shared_ptr& rhs) { return ds::greater_equal(lhs, rhs); } // [[arrow::export]] -std::shared_ptr dataset___expr__less( +std::shared_ptr dataset___expr__less( const std::shared_ptr& lhs, const std::shared_ptr& rhs) { return ds::less(lhs, rhs); } // [[arrow::export]] -std::shared_ptr dataset___expr__less_equal( +std::shared_ptr dataset___expr__less_equal( const std::shared_ptr& lhs, const std::shared_ptr& rhs) { return ds::less_equal(lhs, rhs); } // [[arrow::export]] -std::shared_ptr dataset___expr__in( +std::shared_ptr dataset___expr__in( const std::shared_ptr& lhs, const std::shared_ptr& rhs) { - return std::make_shared(lhs->In(rhs)); + return lhs->In(rhs).Copy(); Review comment: 🤷 not a big deal to me This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. 
For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on a change in pull request #6306: ARROW-7705: [Rust] Initial sort implementation
nevi-me commented on a change in pull request #6306: URL: https://github.com/apache/arrow/pull/6306#discussion_r414663303 ## File path: rust/arrow/src/compute/kernels/sort.rs ## @@ -0,0 +1,671 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines sort kernel for `ArrayRef` + +use std::cmp::Reverse; + +use crate::array::*; +use crate::compute::take; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use TimeUnit::*; + +/// Sort the `ArrayRef` using `SortOptions`. +/// +/// Performs a stable sort on values and indices, returning nulls after sorted valid values, +/// while preserving the order of the nulls. +/// +/// Returns and `ArrowError::ComputeError(String)` if the array type is either unsupported by `sort_to_indices` or `take`. Review comment: Thanks, I'll merge this after CI completes This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on a change in pull request #6306: ARROW-7705: [Rust] Initial sort implementation
nevi-me commented on a change in pull request #6306: URL: https://github.com/apache/arrow/pull/6306#discussion_r414662522 ## File path: rust/arrow/src/compute/kernels/sort.rs ## @@ -0,0 +1,671 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +//! Defines sort kernel for `ArrayRef` + +use std::cmp::Reverse; + +use crate::array::*; +use crate::compute::take; +use crate::datatypes::*; +use crate::error::{ArrowError, Result}; + +use TimeUnit::*; + +/// Sort the `ArrayRef` using `SortOptions`. +/// +/// Performs a stable sort on values and indices, returning nulls after sorted valid values, +/// while preserving the order of the nulls. +/// +/// Returns and `ArrowError::ComputeError(String)` if the array type is either unsupported by `sort_to_indices` or `take`. Review comment: ```suggestion /// Returns an `ArrowError::ComputeError(String)` if the array type is either unsupported by `sort_to_indices` or `take`. ``` This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nealrichardson commented on pull request #7028: ARROW-8575: [Developer] Add issue_comment workflow to rebase a PR
nealrichardson commented on pull request #7028: URL: https://github.com/apache/arrow/pull/7028#issuecomment-619074972 I'm not worried about security risks in this particular case. If some random person wants to rebase my PR on apache/arrow@master, great! Now I don't have to! While I see how other workflows that push might be vulnerable, this workflow is not open-ended, does not take any arguments, and it only pushes on success, so I don't understand how it could be exploited. Am I missing something? Involving Python here sounds like overkill. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rdettai edited a comment on pull request #6949: ARROW-7681: [Rust] Explicitly seeking a BufReader will discard the internal buffer (2)
rdettai edited a comment on pull request #6949: URL: https://github.com/apache/arrow/pull/6949#issuecomment-618997029 > Originally we designed it this way so that we can concurrently read multiple column chunks after obtaining file handle from a single row group. Since the file handle is shared between these we wanted to provide thread safety on top of it. I'm not sure to understand how this could be the concern of the `FileSource`. It's the reader (file handle) that is passed to it that should be thread safe, and `FileSource` should not create the Mutex himself. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] rdettai commented on pull request #6949: ARROW-7681: [Rust] Explicitly seeking a BufReader will discard the internal buffer (2)
rdettai commented on pull request #6949: URL: https://github.com/apache/arrow/pull/6949#issuecomment-618997029 > Originally we designed it this way so that we can concurrently read multiple column chunks after obtaining file handle from a single row group. Since the file handle is shared between these we wanted to provide thread safety on top of it. I'm not sure to understand how this could be the concern of the `FileSource`. It's the reader (file handle) that is passed to it that should be thread safe, and it should not create the Mutex himself. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] fsaintjacques commented on pull request #7022: ARROW-8562: [C++] IO: Parameterize I/O Coalescing using S3 metrics
fsaintjacques commented on pull request #7022: URL: https://github.com/apache/arrow/pull/7022#issuecomment-618985822 Could you accompany a script/utility to compute both metrics? Paired with toxiproxy, we could replicate S3 regions behavior with localhost. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jianxind commented on a change in pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
jianxind commented on a change in pull request #7029: URL: https://github.com/apache/arrow/pull/7029#discussion_r414505645 ## File path: cpp/src/arrow/util/spaced.h ## @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/bit_util.h" + +#ifdef ARROW_HAVE_AVX512 +#include +#endif // ARROW_HAVE_AVX512 + +namespace arrow { +namespace util { +namespace internal { + +template +int PutSpacedScalar(const T* values, int num_values, const uint8_t* valid_bits, +int64_t valid_bits_offset, T* output) { + int num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + for (int32_t i = 0; i < num_values; i++) { +if (valid_bits_reader.IsSet()) { + output[num_valid_values++] = values[i]; +} +valid_bits_reader.Next(); + } + return num_valid_values; +} + +template +int DecodeSpacedScalar(T* buffer, int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + const int values_read = num_values - null_count; + + // Depending on the number of nulls, some of the value slots in buffer may + // be uninitialized, and this will cause valgrind warnings / potentially UB + memset(static_cast(buffer + 
values_read), 0, + (num_values - values_read) * sizeof(T)); + + // Add spacing for null entries. As we have filled the buffer from the front, + // we need to add the spacing from the back. + int values_to_move = values_read - 1; + // We stop early on one of two conditions: + // 1. There are no more null values that need spacing. Note we infer this + // backwards, when 'i' is equal to 'values_to_move' it indicates + //all nulls have been consumed. + // 2. There are no more non-null values that need to move which indicates + //all remaining slots are null, so their exact value doesn't matter. + for (int i = num_values - 1; (i > values_to_move) && (values_to_move >= 0); i--) { +if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = buffer[values_to_move]; + values_to_move--; +} + } + return num_values; +} + +#if defined(ARROW_HAVE_AVX512) +template +int PutSpacedAvx512Compress(const T* values, int num_values, const uint8_t* valid_bits, +int64_t valid_bits_offset, T* output) { + assert(sizeof(T) == 4 || sizeof(T) == 8); // Only support epi32 and epi64 Review comment: Can't. The instantiation PutSpaced of other types still get to this place though it will never be reached. 
/mnt/github/arrow/cpp/src/arrow/util/spaced.h: In instantiation of 'int arrow::util::internal::PutSpacedAvx512Compress(const T*, int, const uint8_t*, int64_t, T*) [with T = parquet::Int96; uint8_t = unsigned char; int64_t = long int]': /mnt/github/arrow/cpp/src/arrow/util/spaced.h:224:38: required from 'int arrow::util::internal::PutSpacedAvx512(const T*, int, const uint8_t*, int64_t, T*) [with T = parquet::Int96; uint8_t = unsigned char; int64_t = long int]' /mnt/github/arrow/cpp/src/arrow/util/spaced.h:246:28: required from 'int arrow::util::internal::PutSpaced(const T*, int, const uint8_t*, int64_t, T*) [with T = parquet::Int96; uint8_t = unsigned char; int64_t = long int]' /mnt/github/arrow/cpp/src/parquet/encoding.cc:107:63: required from 'void parquet::PlainEncoder::PutSpaced(const T*, int, const uint8_t*, int64_t) [with DType = parquet::PhysicalType<(parquet::Type::type)3>; parquet::PlainEncoder::T = parquet::Int96; uint8_t = unsigned char; int64_t = long int]' This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs edited a comment on pull request #7028: ARROW-8575: [Developer] Add issue_comment workflow to rebase a PR
kszucs edited a comment on pull request #7028: URL: https://github.com/apache/arrow/pull/7028#issuecomment-618944665 There is another security constraint about this approach: anyone can trigger a rebase on the PR not just the participants / committers. To resolve that you need to check `author_association` in the event payload, see the python bot's implementation [here](https://github.com/apache/arrow/blob/master/dev/archery/archery/bot.py#L179). We could implement a feature to the python bot to trigger bash scripts to make the contributing to the comment bot easier while reusing the existing comment handling and responding mechanism. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs edited a comment on pull request #7028: ARROW-8575: [Developer] Add issue_comment workflow to rebase a PR
kszucs edited a comment on pull request #7028: URL: https://github.com/apache/arrow/pull/7028#issuecomment-618944665 There is another security constraint about this approach: anyone can trigger a rebase on the PR not just the participants / committers. To resolve that you need to check `author_association` in the event payload, see the python bot's implementation [here](https://github.com/apache/arrow/blob/master/dev/archery/archery/bot.py#L179). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kszucs commented on pull request #7028: ARROW-8575: [Developer] Add issue_comment workflow to rebase a PR
kszucs commented on pull request #7028: URL: https://github.com/apache/arrow/pull/7028#issuecomment-618944665 There is another security constraint about this approach: anyone can trigger a rebase on the PR not just the participants. To resolve that you need to check `author_association` in the event payload, see the python bot's implementation [here](https://github.com/apache/arrow/blob/master/dev/archery/archery/bot.py#L179). This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] nevi-me commented on pull request #7018: ARROW-8536: [Rust] [Flight] Check in proto file, conditional build if file exists
nevi-me commented on pull request #7018: URL: https://github.com/apache/arrow/pull/7018#issuecomment-618942929 > @nevi-me This is looking good, but the generated source file needs the ASF header. CI is failing with ` apache-rat license violation: rust/arrow-flight/src/arrow.flight.protocol.rs`. I've instead added an exception, I hope that's fine This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
github-actions[bot] commented on pull request #7030: URL: https://github.com/apache/arrow/pull/7030#issuecomment-618919224 https://issues.apache.org/jira/browse/ARROW-7808 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] pitrou commented on pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
pitrou commented on pull request #7029: URL: https://github.com/apache/arrow/pull/7029#issuecomment-618917554 I'd gladly see a AVX2 or SSE version indeed, as many CPUs don't have AVX512. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] zhztheplayer opened a new pull request #7030: ARROW-7808: [Java][Dataset] Implement Datasets Java API
zhztheplayer opened a new pull request #7030: URL: https://github.com/apache/arrow/pull/7030 Add following Datasets APIs to Java: - DatasetFactory - Dataset - Scanner - ScanTask Add a native dataset path to bridge c++ Datasets components to Java: - NativeDatasetFactory - NativeDataset - NativeScanner - NativeScanTask Add a simple reference `NativeDatasetFactory` implementation, which is already being useful in some scenarios: - SingleFileDatasetFactory - SingleFileDataset Unit tests are based on `SingleFileDataset`/`NativeDatasetFactory`. I may add more test cases under this PR in a time. This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
kiszk commented on a change in pull request #7029: URL: https://github.com/apache/arrow/pull/7029#discussion_r414434809 ## File path: cpp/src/arrow/util/spaced.h ## @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/bit_util.h" + +#ifdef ARROW_HAVE_AVX512 +#include +#endif // ARROW_HAVE_AVX512 + +namespace arrow { +namespace util { +namespace internal { + +template +int PutSpacedScalar(const T* values, int num_values, const uint8_t* valid_bits, +int64_t valid_bits_offset, T* output) { + int num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + for (int32_t i = 0; i < num_values; i++) { +if (valid_bits_reader.IsSet()) { + output[num_valid_values++] = values[i]; +} +valid_bits_reader.Next(); + } + return num_valid_values; +} + +template +int DecodeSpacedScalar(T* buffer, int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + const int values_read = num_values - null_count; + + // Depending on the number of nulls, some of the value slots in buffer may + // be uninitialized, and this will cause valgrind warnings / potentially UB + memset(static_cast(buffer + 
values_read), 0, + (num_values - values_read) * sizeof(T)); + + // Add spacing for null entries. As we have filled the buffer from the front, + // we need to add the spacing from the back. + int values_to_move = values_read - 1; + // We stop early on one of two conditions: + // 1. There are no more null values that need spacing. Note we infer this + // backwards, when 'i' is equal to 'values_to_move' it indicates + //all nulls have been consumed. + // 2. There are no more non-null values that need to move which indicates + //all remaining slots are null, so their exact value doesn't matter. + for (int i = num_values - 1; (i > values_to_move) && (values_to_move >= 0); i--) { +if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = buffer[values_to_move]; + values_to_move--; +} + } + return num_values; +} + +#if defined(ARROW_HAVE_AVX512) +template +int PutSpacedAvx512Compress(const T* values, int num_values, const uint8_t* valid_bits, +int64_t valid_bits_offset, T* output) { + assert(sizeof(T) == 4 || sizeof(T) == 8); // Only support epi32 and epi64 Review comment: How about using `staic_assert()`? This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] kiszk commented on a change in pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
kiszk commented on a change in pull request #7029: URL: https://github.com/apache/arrow/pull/7029#discussion_r414434434 ## File path: cpp/src/arrow/util/spaced.h ## @@ -0,0 +1,266 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +#pragma once + +#include "arrow/util/bit_util.h" + +#ifdef ARROW_HAVE_AVX512 +#include +#endif // ARROW_HAVE_AVX512 + +namespace arrow { +namespace util { +namespace internal { + +template +int PutSpacedScalar(const T* values, int num_values, const uint8_t* valid_bits, +int64_t valid_bits_offset, T* output) { + int num_valid_values = 0; + arrow::internal::BitmapReader valid_bits_reader(valid_bits, valid_bits_offset, + num_values); + for (int32_t i = 0; i < num_values; i++) { +if (valid_bits_reader.IsSet()) { + output[num_valid_values++] = values[i]; +} +valid_bits_reader.Next(); + } + return num_valid_values; +} + +template +int DecodeSpacedScalar(T* buffer, int num_values, int null_count, + const uint8_t* valid_bits, int64_t valid_bits_offset) { + const int values_read = num_values - null_count; + + // Depending on the number of nulls, some of the value slots in buffer may + // be uninitialized, and this will cause valgrind warnings / potentially UB + memset(static_cast(buffer + 
values_read), 0, + (num_values - values_read) * sizeof(T)); + + // Add spacing for null entries. As we have filled the buffer from the front, + // we need to add the spacing from the back. + int values_to_move = values_read - 1; + // We stop early on one of two conditions: + // 1. There are no more null values that need spacing. Note we infer this + // backwards, when 'i' is equal to 'values_to_move' it indicates + //all nulls have been consumed. + // 2. There are no more non-null values that need to move which indicates + //all remaining slots are null, so their exact value doesn't matter. + for (int i = num_values - 1; (i > values_to_move) && (values_to_move >= 0); i--) { +if (BitUtil::GetBit(valid_bits, valid_bits_offset + i)) { + buffer[i] = buffer[values_to_move]; + values_to_move--; +} + } + return num_values; +} + +#if defined(ARROW_HAVE_AVX512) +template <typename T> +int PutSpacedAvx512Compress(const T* values, int num_values, const uint8_t* valid_bits, +int64_t valid_bits_offset, T* output) { + assert(sizeof(T) == 4 || sizeof(T) == 8); // Only support epi32 and epi64 + constexpr int kBatchSize = sizeof(__m512i) / sizeof(T); + constexpr int kBatchValidBytes = kBatchSize / 8; + int num_valid_values = 0; + int idx_values = 0; + int64_t idx_valid_bits = valid_bits_offset; + + // First handle the front suffix + const int64_t offset_suffix_front = 8 - (valid_bits_offset % 8); + for (int64_t i = 0; (i < offset_suffix_front) && (idx_values < num_values); i++) { +if (BitUtil::GetBit(valid_bits, idx_valid_bits)) { + output[num_valid_values] = values[idx_values]; + num_valid_values++; +} +idx_values++; +idx_valid_bits++; + } + + // The parts can fill into batches + uint8_t valid_count; + int64_t idx_valid_bytes = BitUtil::BytesForBits(idx_valid_bits + 1) - 1; + static const __m512i zero = _mm512_set_epi64(0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0); + while (num_values - idx_values >= kBatchSize) { +// count the valid numbers of one batch. 
+valid_count = BitUtil::kBytePopcount[valid_bits[idx_valid_bytes]]; +if (kBatchValidBytes > 1) { + valid_count += BitUtil::kBytePopcount[valid_bits[idx_valid_bytes + 1]]; +} + +// pack the data +if (valid_count > 0) { + __m512i src = _mm512_loadu_si512(values + idx_values); + __m512i result; + if (sizeof(T) == 4) { +// 16 float for one m512i block, two bytes in valid_bits +__mmask16 k = *(reinterpret_cast<const __mmask16*>(valid_bits + idx_valid_bytes)); +result = _mm512_mask_compress_epi32(zero, k, src); + } else { +// 8 double for one m512i block, one byte in valid_bits +__mmask8 k = *(valid_bits + idx_valid_bytes); +result = _mm512_mask_compress_epi64(zero, k, src); + } + +
[GitHub] [arrow] rdettai commented on a change in pull request #6935: ARROW-8455: [Rust] Parquet Arrow column read on partially compatible files
rdettai commented on a change in pull request #6935: URL: https://github.com/apache/arrow/pull/6935#discussion_r414417688 ## File path: rust/parquet/src/column/reader.rs ## @@ -190,15 +190,12 @@ impl<T: DataType> ColumnReaderImpl<T> { (self.num_buffered_values - self.num_decoded_values) as usize, ); -// Adjust batch size by taking into account how much space is left in -// values slice or levels slices (if available) -adjusted_size = min(adjusted_size, values.len() - values_read); -if let Some(ref levels) = def_levels { -adjusted_size = min(adjusted_size, levels.len() - levels_read); -} -if let Some(ref levels) = rep_levels { -adjusted_size = min(adjusted_size, levels.len() - levels_read); -} +// Adjust batch size by taking into account how much data there is +// to read. As batch_size is also smaller than value and level +// slices (if available), this ensures that available space is not +// exceeded. +adjusted_size = min(adjusted_size, batch_size - values_read); Review comment: As stated in my PR comment, the `read_batch` function can receive any combination of `batch_size`, `def_levels.len()`, `rep_levels.len()` and `values.len()`. If the `batch_size` is the limiting factor, your `iter_batch_size` might end up larger than the `batch_size`. This happened to me when called from `record_reader.rs` on a parquet file with relatively small row groups (500k rows). I did not manage to reproduce the phenomenon with a mock data file though ... :-/ This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jianxind commented on pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
jianxind commented on pull request #7029: URL: https://github.com/apache/arrow/pull/7029#issuecomment-618855696 cc @emkornfield The AVX512 path is straightforward thanks to the mask_compress/mask_expand helper APIs provided by AVX512. For potential path-finding of SSE/AVX2, as you pointed out in the Jira, a solution with a fixed lookup table may help; I will pursue that later, but it will definitely take more time, so I am committing this completed part first. Below is the benchmark data on an AVX512 device before and after adding the intrinsics: Before: BM_PlainEncodingSpacedFloat/1024 1471 ns 1469 ns 476373 bytes_per_second=2.59603G/s BM_PlainEncodingSpacedDouble/1024 1498 ns 1496 ns 468131 bytes_per_second=5.09834G/s BM_PlainDecodingSpacedFloat/1024 1266 ns 1265 ns 554320 bytes_per_second=3.01623G/s BM_PlainDecodingSpacedDouble/1024 920 ns 919 ns 759151 bytes_per_second=8.30509G/s After: BM_PlainEncodingSpacedFloat/1024 717 ns 716 ns 973249 bytes_per_second=5.32424G/s BM_PlainEncodingSpacedDouble/1024 989 ns 988 ns 710096 bytes_per_second=7.7235G/s BM_PlainDecodingSpacedFloat/1024 217 ns 217 ns 3233406 bytes_per_second=17.613G/s BM_PlainDecodingSpacedDouble/1024 309 ns 309 ns 2267740 bytes_per_second=24.7257G/s This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] github-actions[bot] commented on pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
github-actions[bot] commented on pull request #7029: URL: https://github.com/apache/arrow/pull/7029#issuecomment-618853399 https://issues.apache.org/jira/browse/ARROW-8579 This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org
[GitHub] [arrow] jianxind opened a new pull request #7029: ARROW-8579 [C++] Add AVX512 SIMD for spaced decoding and encoding.
jianxind opened a new pull request #7029: URL: https://github.com/apache/arrow/pull/7029 1. Create the spaced encoding/decoding benchmark items. 2. Create unittest for spaced API SIMD implementation. 3. Move spaced scalar/SIMD to a new head file. 4. Add the path of AVX512 epi32 and epi64 using mask_compress_/mask_expand_ API. Signed-off-by: Frank Du This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. For queries about this service, please contact Infrastructure at: us...@infra.apache.org